ray-project · alexeykudinkin · Sep 26, 2025 · Sep 26, 2025 · Sep 26, 2025 · Sep 26, 2025
@@ -1,6 +1,6 @@
 import logging
 import math
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, Type
 
 from ray._private.arrow_utils import get_pyarrow_version
 from ray.air.util.transform_pyarrow import _is_pa_extension_type
@@ -345,7 +345,16 @@ def __init__(
         right_columns_suffix: Optional[str] = None,
         partition_size_hint: Optional[int] = None,
         aggregator_ray_remote_args_override: Optional[Dict[str, Any]] = None,
+        shuffle_aggregation_type: Optional[Type[StatefulShuffleAggregation]] = None,
     ):
+        if shuffle_aggregation_type is not None:
+            if not issubclass(shuffle_aggregation_type, StatefulShuffleAggregation):
+                raise TypeError(
+                    f"shuffle_aggregation_type must be a subclass of StatefulShuffleAggregation, "
+                    f"got {shuffle_aggregation_type}"
+                )
+
+        aggregation_class = shuffle_aggregation_type or JoiningShuffleAggregation
         super().__init__(
             name_factory=(
                 lambda num_partitions: f"Join(num_partitions={num_partitions})"
@@ -356,7 +365,7 @@ def __init__(
             num_partitions=num_partitions,
             partition_size_hint=partition_size_hint,
             partition_aggregation_factory=(
-                lambda aggregator_id, target_partition_ids: JoiningShuffleAggregation(
+                lambda aggregator_id, target_partition_ids: aggregation_class(
                     aggregator_id=aggregator_id,
                     join_type=join_type,
                     left_key_col_names=left_key_columns,

@@ -2757,7 +2757,7 @@ def unique(self, column: str) -> List[Any]:
 
             >>> import ray
             >>> ds = ray.data.from_items([1, 2, 3, 2, 3])
-            >>> ds.unique("item")
+            >>> sorted(ds.unique("item"))
             [1, 2, 3]
 
             This function is very useful for computing labels
@@ -3032,11 +3032,12 @@ def std(
             >>> import ray
             >>> round(ray.data.range(100).std("id", ddof=0), 5)
             28.86607
-            >>> ray.data.from_items([
+            >>> result = ray.data.from_items([
             ...     {"A": i, "B": i**2}
             ...     for i in range(100)
             ... ]).std(["A", "B"])
-            {'std(A)': 29.011491975882016, 'std(B)': 2968.1748039269296}
+            >>> [(key, round(value, 10)) for key, value in result.items()]
+            [('std(A)', 29.0114919759), ('std(B)', 2968.1748039269)]
 
         Args:
             on: a column name or a list of column names to aggregate.