ray-project
diff --git a/‎ci/raydepsets/cli.py‎
Lines changed: 28 additions & 0 deletions b/‎ci/raydepsets/cli.py‎
Lines changed: 28 additions & 0 deletions
diff --git a/‎doc/source/data/api/aggregate.rst‎
Lines changed: 1 addition & 0 deletions b/‎doc/source/data/api/aggregate.rst‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎doc/source/serve/advanced-guides/multi-app-container.md‎
Lines changed: 2 additions & 0 deletions b/‎doc/source/serve/advanced-guides/multi-app-container.md‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎python/ray/data/aggregate.py‎
Lines changed: 58 additions & 0 deletions b/‎python/ray/data/aggregate.py‎
Lines changed: 58 additions & 0 deletions
diff --git a/‎python/ray/data/read_api.py‎
Lines changed: 1 addition & 1 deletion b/‎python/ray/data/read_api.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎python/ray/data/tests/test_groupby_e2e.py‎
Lines changed: 12 additions & 0 deletions b/‎python/ray/data/tests/test_groupby_e2e.py‎
Lines changed: 12 additions & 0 deletions
@@ -96,6 +96,15 @@ def __init__(
         check: Optional[bool] = False,
         build_all_configs: Optional[bool] = False,
     ):
+        """Initialize the dependency set manager.
+
+        Args:
+            config_path: Path to the depsets config file.
+            workspace_dir: Path to the workspace directory.
+            uv_cache_dir: Directory to cache uv dependencies.
+            check: Whether to check if lock files are up to date.
+            build_all_configs: Whether to build all configs or just the specified one.
+        """
         self.workspace = Workspace(workspace_dir)
         self.config = self.workspace.load_configs(config_path)
         self.config_name = os.path.basename(config_path)
@@ -109,6 +118,7 @@ def __init__(
             self.copy_to_temp_dir()
 
     def get_output_paths(self) -> List[Path]:
+        """Get all output paths for depset nodes in topological order."""
         output_paths = []
         for node in topological_sort(self.build_graph):
             if self.build_graph.nodes[node]["node_type"] == "depset":
@@ -126,6 +136,7 @@ def copy_to_temp_dir(self):
             )
 
     def get_diffs(self) -> List[str]:
+        """Compare current lock files with previously saved copies and return unified diffs."""
         diffs = []
         for output_path in self.output_paths:
             new_lock_file_fp, old_lock_file_fp = self.get_source_and_dest(output_path)
@@ -142,6 +153,7 @@ def get_diffs(self) -> List[str]:
         return diffs
 
     def diff_lock_files(self):
+        """Check if lock files are up to date and raise an error if not."""
         diffs = self.get_diffs()
         if len(diffs) > 0:
             raise RuntimeError(
@@ -151,9 +163,11 @@ def diff_lock_files(self):
         click.echo("Lock files are up to date.")
 
     def get_source_and_dest(self, output_path: str) -> tuple[Path, Path]:
+        """Get the source workspace path and temporary destination path for a lock file."""
         return (self.get_path(output_path), (Path(self.temp_dir) / output_path))
 
     def _build(self, build_all_configs: Optional[bool] = False):
+        """Build the dependency graph from config depsets."""
         for depset in self.config.depsets:
             if depset.operation == "compile":
                 self.build_graph.add_node(
@@ -201,11 +215,13 @@ def _build(self, build_all_configs: Optional[bool] = False):
             self.subgraph_config_nodes()
 
     def subgraph_dependency_nodes(self, depset_name: str):
+        """Reduce the build graph to only include the specified depset and its ancestors."""
         dependency_nodes = networkx_ancestors(self.build_graph, depset_name)
         nodes = dependency_nodes | {depset_name}
         self.build_graph = self.build_graph.subgraph(nodes).copy()
 
     def subgraph_config_nodes(self):
+        """Reduce the build graph to nodes matching the current config and their ancestors."""
         # Get all nodes that have the target config name
         config_nodes = [
             node
@@ -224,6 +240,7 @@ def subgraph_config_nodes(self):
         self.build_graph = self.build_graph.subgraph(nodes).copy()
 
     def execute(self, single_depset_name: Optional[str] = None):
+        """Execute all depsets in topological order, optionally limited to a single depset."""
         if single_depset_name:
             # check if the depset exists
             _get_depset(self.config.depsets, single_depset_name)
@@ -240,6 +257,7 @@ def execute(self, single_depset_name: Optional[str] = None):
     def exec_uv_cmd(
         self, cmd: str, args: List[str], stdin: Optional[bytes] = None
     ) -> str:
+        """Execute a uv pip command with the given arguments."""
         cmd = [self._uv_binary, "pip", cmd, *args]
         click.echo(f"Executing command: {' '.join(cmd)}")
         status = subprocess.run(
@@ -252,6 +270,7 @@ def exec_uv_cmd(
         return status.stdout.decode("utf-8")
 
     def execute_pre_hook(self, pre_hook: str):
+        """Execute a pre-hook shell command."""
         status = subprocess.run(
             shlex.split(pre_hook),
             cwd=self.workspace.dir,
@@ -265,6 +284,7 @@ def execute_pre_hook(self, pre_hook: str):
         click.echo(f"Executed pre_hook {pre_hook} successfully")
 
     def execute_depset(self, depset: Depset):
+        """Execute a single depset based on its operation type (compile, subset, or expand)."""
         if depset.operation == "compile":
             self.compile(
                 constraints=depset.constraints,
@@ -389,15 +409,18 @@ def expand(
         )
 
     def read_lock_file(self, file_path: Path) -> List[str]:
+        """Read and return the contents of a lock file as a list of lines."""
         if not file_path.exists():
             raise RuntimeError(f"Lock file {file_path} does not exist")
         with open(file_path, "r") as f:
             return f.readlines()
 
     def get_path(self, path: str) -> Path:
+        """Join a path onto the workspace directory and return the resulting Path."""
         return Path(self.workspace.dir) / path
 
     def check_subset_exists(self, source_depset: Depset, requirements: List[str]):
+        """Verify that all requirements exist in the source depset."""
         for req in requirements:
             if req not in self.get_expanded_depset_requirements(source_depset.name, []):
                 raise RuntimeError(
@@ -424,15 +447,18 @@ def get_expanded_depset_requirements(
         return list(set(requirements_list))
 
     def cleanup(self):
+        """Remove the temporary directory used for lock file comparisons."""
         if self.temp_dir:
             shutil.rmtree(self.temp_dir)
 
 
 def _get_bytes(packages: List[str]) -> bytes:
+    """Convert a list of package names to newline-separated UTF-8 bytes."""
     return ("\n".join(packages) + "\n").encode("utf-8")
 
 
 def _get_depset(depsets: List[Depset], name: str) -> Depset:
+    """Find and return a depset by name from a list of depsets."""
     for depset in depsets:
         if depset.name == name:
             return depset
@@ -452,6 +478,7 @@ def _flatten_flags(flags: List[str]) -> List[str]:
 
 
 def _override_uv_flags(flags: List[str], args: List[str]) -> List[str]:
+    """Override existing uv flags in args with new values from flags."""
     flag_names = {f.split()[0] for f in flags if f.startswith("--")}
     new_args = []
     skip_next = False
@@ -468,6 +495,7 @@ def _override_uv_flags(flags: List[str], args: List[str]) -> List[str]:
 
 
 def _uv_binary():
+    """Get the path to the uv binary for the current platform."""
     r = runfiles.Create()
     system = platform.system()
     processor = platform.processor()
 
@@ -25,6 +25,7 @@ compute aggregations.
     AbsMax
     Quantile
     Unique
+    CountDistinct
     ValueCounter
     MissingValuePercentage
     ZeroPercentage
 
@@ -167,3 +167,5 @@ If raylet is running inside a container, then that container needs the necessary
   * This error should only occur when you're running the Ray cluster inside a container. If you see this error when starting the replica actor, try volume mounting `/var/lib/containers` in the container that runs raylet. That is, add `-v /var/lib/containers:/var/lib/containers` to the command that starts the Docker container.
 * **cannot clone: Operation not permitted; Error: cannot re-exec process**
   * This error should only occur when you're running the Ray cluster inside a container. This error implies that you don't have the permissions to use Podman to start a container. You need to start the container that runs raylet, with privileged permissions by adding `--privileged`.
+* **Very slow or hanging container startup**
+  * This is typically caused by using the default Podman storage driver (`vfs`) with large container images. Podman runs in rootless mode, so its startup sequence involves modifying the permissions of files in the container, which the default storage driver does very slowly. Try configuring Podman to use the `overlay` storage driver instead. You may also need to set `mount_program` to `/usr/bin/fuse-overlayfs` (or the appropriate path on your system).
@@ -1032,6 +1032,64 @@ def _normalize_nans(x: Collection) -> Set:
         return {v if not (isinstance(v, float) and np.isnan(v)) else np.nan for v in x}
 
 
+@PublicAPI
+class CountDistinct(Unique):
+    """Defines distinct count aggregation.
+
+    This aggregation computes the count of distinct values in a column.
+    It is similar to SQL's COUNT(DISTINCT column_name) operation.
+
+    Example:
+
+        .. testcode::
+
+            import ray
+            from ray.data.aggregate import CountDistinct
+
+            # Create a dataset with repeated values
+            ds = ray.data.from_items([
+                {"category": "A"}, {"category": "B"}, {"category": "A"},
+                {"category": "C"}, {"category": "A"}, {"category": "B"}
+            ])
+
+            # Count distinct categories
+            result = ds.aggregate(CountDistinct(on="category"))
+            # result: {'count_distinct(category)': 3}
+
+            # Using with groupby
+            ds = ray.data.from_items([
+                {"group": "X", "category": "A"}, {"group": "X", "category": "B"},
+                {"group": "Y", "category": "A"}, {"group": "Y", "category": "A"}
+            ])
+            result = ds.groupby("group").aggregate(CountDistinct(on="category")).take_all()
+            # result: [{'group': 'X', 'count_distinct(category)': 2},
+            #          {'group': 'Y', 'count_distinct(category)': 1}]
+
+    Args:
+        on: The name of the column to count distinct values on.
+        ignore_nulls: Whether to ignore null values when counting distinct items.
+            Defaults to True (nulls are excluded from the count).
+        alias_name: Optional name for the resulting column. If not provided,
+            defaults to "count_distinct({on})".
+    """
+
+    def __init__(
+        self,
+        on: str,
+        ignore_nulls: bool = True,
+        alias_name: Optional[str] = None,
+    ):
+        super().__init__(
+            on=on,
+            ignore_nulls=ignore_nulls,
+            alias_name=alias_name if alias_name else f"count_distinct({str(on)})",
+        )
+
+    def finalize(self, accumulator: Set[Any]) -> int:
+        """Return the count of distinct values."""
+        return len(accumulator)
+
+
 @PublicAPI
 class ValueCounter(AggregateFnV2):
     """Counts the number of times each value appears in a column.
 
@@ -4089,7 +4089,7 @@ def read_unity_catalog(
 
     This function works by leveraging Unity Catalog's credential vending feature, which grants temporary, least-privilege
     credentials for the cloud storage location backing the requested table or data files. It authenticates via the Unity Catalog
-    REST API (Unity Catalog credential vending for external system access, `Databricks Docs <https://docs.databricks.com/en/data-governance/unity-catalog/credential-vending.html>`_),
+    REST API (Unity Catalog credential vending for external system access, `Databricks Docs <https://docs.databricks.com/en/external-access/credential-vending.html>`_),
     ensuring that permissions are enforced at the Databricks principal (user, group, or service principal) making the request.
     The function supports reading data directly from AWS S3, Azure Data Lake, or GCP GCS in standard formats including Delta and Parquet.
 
 
@@ -21,6 +21,7 @@
     AbsMax,
     AggregateFn,
     Count,
+    CountDistinct,
     Max,
     Mean,
     Min,
@@ -508,6 +509,7 @@ def test_groupby_arrow_multi_agg(
         .aggregate(
             Count(),
             Count("B"),
+            CountDistinct("B"),
             Sum("B"),
             Min("B"),
             Max("B"),
@@ -526,6 +528,7 @@ def test_groupby_arrow_multi_agg(
             "B": [
                 "count",
                 "count",
+                "nunique",
                 "sum",
                 "min",
                 "max",
@@ -542,6 +545,7 @@ def test_groupby_arrow_multi_agg(
         "A",
         "count()",
         "count(B)",
+        "count_distinct(B)",
         "sum(B)",
         "min(B)",
         "max(B)",
@@ -637,6 +641,9 @@ def test_groupby_multi_agg_with_nans(
         .groupby("A")
         .aggregate(
             Count("B", alias_name="count_b", ignore_nulls=ignore_nulls),
+            CountDistinct(
+                "B", alias_name="count_distinct_b", ignore_nulls=ignore_nulls
+            ),
             Sum("B", alias_name="sum_b", ignore_nulls=ignore_nulls),
             Min("B", alias_name="min_b", ignore_nulls=ignore_nulls),
             Max("B", alias_name="max_b", ignore_nulls=ignore_nulls),
@@ -654,6 +661,7 @@ def test_groupby_multi_agg_with_nans(
         {
             "B": [
                 ("count_b", lambda s: s.count() if ignore_nulls else len(s)),
+                ("count_distinct_b", lambda s: s.nunique(dropna=ignore_nulls)),
                 ("sum_b", lambda s: s.sum(skipna=ignore_nulls)),
                 ("min_b", lambda s: s.min(skipna=ignore_nulls)),
                 ("max_b", lambda s: s.max(skipna=ignore_nulls)),
@@ -674,6 +682,7 @@ def test_groupby_multi_agg_with_nans(
     grouped_df.columns = [
         "A",
         "count_b",
+        "count_distinct_b",
         "sum_b",
         "min_b",
         "max_b",
@@ -744,6 +753,7 @@ def test_groupby_aggregations_are_associative(
 
     aggs = [
         Count("B", alias_name="count_b", ignore_nulls=ignore_nulls),
+        CountDistinct("B", alias_name="count_distinct_b", ignore_nulls=ignore_nulls),
         Sum("B", alias_name="sum_b", ignore_nulls=ignore_nulls),
         Min("B", alias_name="min_b", ignore_nulls=ignore_nulls),
         Max("B", alias_name="max_b", ignore_nulls=ignore_nulls),
@@ -759,6 +769,7 @@ def test_groupby_aggregations_are_associative(
         {
             "B": [
                 ("count", lambda s: s.count() if ignore_nulls else len(s)),
+                ("count_distinct", lambda s: s.nunique(dropna=ignore_nulls)),
                 ("sum", lambda s: s.sum(skipna=ignore_nulls, min_count=1)),
                 ("min", lambda s: s.min(skipna=ignore_nulls)),
                 ("max", lambda s: s.max(skipna=ignore_nulls)),
@@ -779,6 +790,7 @@ def test_groupby_aggregations_are_associative(
     grouped_df.columns = [
         "A",
         "count_b",
+        "count_distinct_b",
         "sum_b",
         "min_b",
         "max_b",