Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
b9bb173
start with lawrences doodle
rjzamora Jun 5, 2025
8699a9e
save work
rjzamora Jun 6, 2025
2d590dc
revise basic class structure
rjzamora Jun 9, 2025
6620a6f
Merge remote-tracking branch 'upstream/branch-25.08' into column-stats
rjzamora Jun 9, 2025
511d059
tests passing
rjzamora Jun 10, 2025
2cf53d2
change the config name
rjzamora Jun 10, 2025
4505d7d
Merge remote-tracking branch 'upstream/branch-25.08' into column-stats
rjzamora Jun 10, 2025
49ce228
Merge branch 'branch-25.08' into column-stats
rjzamora Jun 10, 2025
4fee62c
remove TableSourceStats
rjzamora Jun 11, 2025
88247f4
Merge remote-tracking branch 'upstream/branch-25.08' into column-stats
rjzamora Jun 11, 2025
2d0c43d
minor cleanup
rjzamora Jun 11, 2025
455ad2d
Merge branch 'column-stats' of github.com:rjzamora/cudf into column-s…
rjzamora Jun 11, 2025
e4f284c
Merge remote-tracking branch 'upstream/branch-25.08' into column-stats
rjzamora Jun 12, 2025
63e650a
test coverage
rjzamora Jun 12, 2025
c076ec4
Update python/cudf_polars/cudf_polars/dsl/traversal.py
rjzamora Jun 12, 2025
6672aa3
use LRU instead of FIFO
rjzamora Jun 12, 2025
2def2df
Merge branch 'column-stats' of github.com:rjzamora/cudf into column-s…
rjzamora Jun 12, 2025
daedd2d
Merge remote-tracking branch 'upstream/branch-25.08' into column-stats
rjzamora Jun 12, 2025
adce001
avoid key errors
rjzamora Jun 12, 2025
b7f52e4
Merge remote-tracking branch 'upstream/branch-25.08' into column-stats
rjzamora Jun 12, 2025
49802a9
Merge remote-tracking branch 'upstream/branch-25.08' into column-stats
rjzamora Jun 26, 2025
afd1a9b
rename column_statistics to column_stats
rjzamora Jun 26, 2025
c6af3b4
more renaming of statistics to stats
rjzamora Jun 26, 2025
6b162b3
pull config_options back out of StatsCollector
rjzamora Jun 26, 2025
6d168eb
Merge remote-tracking branch 'upstream/branch-25.08' into column-stats
rjzamora Jun 26, 2025
2208fad
fix typo
rjzamora Jun 26, 2025
ab981c5
change _get_unique_fractions input types
rjzamora Jun 26, 2025
7cfd6bf
Merge branch 'branch-25.08' into column-stats
rjzamora Jun 26, 2025
46486a2
Merge branch 'branch-25.08' into column-stats
rjzamora Jun 27, 2025
ba7a0b2
Merge remote-tracking branch 'upstream/branch-25.08' into column-stats
rjzamora Jun 27, 2025
fb71946
tweak coverage
rjzamora Jun 27, 2025
c79b16b
more coverage tweaks
rjzamora Jun 27, 2025
baf9581
Merge remote-tracking branch 'upstream/branch-25.08' into column-stats
rjzamora Jun 30, 2025
a71dce0
add back post_traversal to fix coverage
rjzamora Jun 30, 2025
22ef2e2
add post_traversal test
rjzamora Jun 30, 2025
8b7340d
save experiment
rjzamora Jun 30, 2025
db8c2b2
Merge remote-tracking branch 'upstream/branch-25.08' into column-stats
rjzamora Jul 1, 2025
f974f23
Merge remote-tracking branch 'upstream/branch-25.08' into column-stat…
rjzamora Jul 1, 2025
4d3b6e9
fix groupby
rjzamora Jul 1, 2025
7319ab1
remove custom caching
rjzamora Jul 1, 2025
2c59639
Merge branch 'column-stats-v2' into column-stats
rjzamora Jul 1, 2025
c5a5669
Merge remote-tracking branch 'upstream/branch-25.08' into column-stats
rjzamora Jul 1, 2025
9e4a464
rename
rjzamora Jul 1, 2025
03e7c96
adjust coverage
rjzamora Jul 1, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 35 additions & 1 deletion python/cudf_polars/cudf_polars/dsl/traversal.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
# SPDX-License-Identifier: Apache-2.0

"""Traversal and visitor utilities for nodes."""
Expand Down Expand Up @@ -49,6 +49,40 @@ def traversal(nodes: Sequence[NodeT]) -> Generator[NodeT, None, None]:
lifo.append(child)


def post_traversal(nodes: Sequence[NodeT]) -> Generator[NodeT, None, None]:
    """
    Post-order traversal of nodes in an expression.

    Parameters
    ----------
    nodes
        Roots of expressions to traverse.

    Yields
    ------
    Unique nodes in the expressions, child before parent, children
    in-order from left to right.
    """
    visited = set()
    stack = []

    # Seed the stack with unique roots, preserving input order.
    for root in nodes:
        if root not in visited:
            visited.add(root)
            stack.append(root)

    while stack:
        top = stack[-1]
        # Find the left-most child we have not yet visited.
        pending = next((c for c in top.children if c not in visited), None)
        if pending is not None:
            # Descend into the child first (post-order: children precede parent).
            visited.add(pending)
            stack.append(pending)
        else:
            # All children emitted; the node itself is next.
            stack.pop()
            yield top


def reuse_if_unchanged(node: NodeT, fn: GenericTransformer[NodeT, NodeT]) -> NodeT:
"""
Recipe for transforming nodes that returns the old object if unchanged.
Expand Down
133 changes: 132 additions & 1 deletion python/cudf_polars/cudf_polars/experimental/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,10 @@
from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
from collections.abc import Generator, Iterator
from collections.abc import Callable, Generator, Iterator

from cudf_polars.dsl.expr import NamedExpr
from cudf_polars.dsl.ir import IR
from cudf_polars.dsl.nodebase import Node


Expand Down Expand Up @@ -44,3 +45,133 @@ def __rich_repr__(self) -> Generator[Any, None, None]:
def get_key_name(node: Node) -> str:
"""Generate the key name for a Node."""
return f"{type(node).__name__.lower()}-{hash(node)}"


class UniqueSourceStats:
    """
    Unique source statistics.

    Parameters
    ----------
    count
        Unique-value count, or ``None`` if unknown.
    fraction
        Unique-value fraction (unique count divided by row
        count), or ``None`` if unknown.
    """

    __slots__ = ("count", "fraction")

    def __init__(
        self,
        *,
        count: int | None = None,
        fraction: float | None = None,
    ) -> None:
        # -> None annotation added for consistency with
        # ColumnStats.__init__ and StatsCollector.__init__.
        self.count = count
        self.fraction = fraction


class ColumnSourceStats:
"""
Column source statistics.

Parameters
----------
cardinality
Cardinality (row count).
unique_stats
Unique-value statistics.
storage_size_per_file
Average un-compressed storage size for this
column in a single file. This value is used to
calculate the partition count for an IR node.
    exact
        Tuple of attributes that have not been estimated
        by partial sampling, and are known exactly.

Notes
-----
Source statistics are statistics coming from "source"
    nodes like ``Scan`` and ``DataFrameScan``.
"""

__slots__ = (
"_unique_stats",
"cardinality",
"exact",
"storage_size_per_file",
)

def __init__(
self,
*,
cardinality: int | None = None,
storage_size_per_file: int | None = None,
exact: tuple[str, ...] = (),
unique_stats: Any = None,
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
unique_stats: Any = None,
unique_stats: Callable[[], UniqueSourceStats] | UniqueSourceStats | None = None,

):
self.cardinality = cardinality
self.storage_size_per_file = storage_size_per_file
self.exact = exact
self._unique_stats: Callable[..., UniqueSourceStats] | UniqueSourceStats
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the motivation for making unique_stats a callable? Delayed evaluation of something?

If possible, I'd prefer the simplicity of just having a plain value here.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So this seems to be from functools.partial(UniquePqSampler.__call__, name). If I'm reading this correctly, we build one of these per column in the file. And perhaps we don't want to collect these types of statistics, which seems to involve a reading some row groups and doing a distinct_count, if we don't actually use it. Hence the delayed collection. Makes sense.

I wonder if we have enough information here to know what columns we actually want statistics for (based on which columns we do operations on that would make use of those statistics). I see _get_unique_fractions uses unique_fraction.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I think your understanding is correct. To be completely honest, I've wasted quite a bit of time struggling to find an "elegant" way to do everything I want:

  • Whenever we need to estimate the number/fraction of unique values in a column, we should sample the metadata/data to make an estimate.
  • Right now we rely on the user-provided "cardinality_factor" config value (renamed to "unique_fraction" here). In my opinion, this config value is a "hack".
  • It is rare for unique-value statistics to be in the parquet metadata, so this PR focuses on reading a small number of real row-groups
  • This PR does centralize unique-value queries in one location: _get_unique_fractions
  • It is relatively common to query unique-value statistics for multiple columns in the same dataset. Therefore, we definitely want to collect these statistics lazily (especially important for remote data).
  • It would be nice to avoid traversing the IR graph multiple times to collect source data, because we will ultimately need to traverse the graph again multiple times (to apply join heuristics and "lower" the graph).

The current implementation "works" and largely handles caching pretty well. However, I'm still pretty unhappy with the design.

I wonder if the "key" to cleaning this up is to roll back the decision to store the column-specific source statistics under ColumnStats.source_stats instead of storing a reference to the general table-source statistics. For example, we probably want something like:

class UniqueStats:
    # Simple class to hold materialized unique-value stats
    count: int | None
    fraction: float | None

class SourceSampler:
    # Lazy source sampler.
    #   - We would use a `PqSourceSampler` for parquet.
    #   - These samplers can be generated by a helper function with lru caching.
    def file_storage_size(self, column: str) -> int
    def cardinality(self) -> int
    def add_shuffle_keys(self, columns: Sequence[str]) -> None
    def unique_stats(self, column: str) -> UniqueStats

class ColumnStats:
    name: str
    unique_count: int | None
    source_name: str | None
    source_sampler: SourceSampler | None

With this (or something like it), we can completely avoid sampling "real" source data until we are actually "using". As the source-stats are propagated through the graph, we could use SourceSampler.add_shuffle_keys to populate the set of source statistics that may require unique-value sampling.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll just note that I too don't have an elegant design for this in mind :/ It's a big feature and the requirements we need out of it are still a bit fuzzy (to me).

I think I mostly agree with your list of desires, maybe other than the "avoid traversing the IR graph multiple times" one, since the information we gather will affect the IR graph we produce. Maybe that can all be done in one shot, since the source stats will be gathered early on.

I do think it's worth spending the time to get the design right (easy for me to say, I know). But I'm also fine with getting something in sooner rather than later and iterating on it, especially if the "right" design isn't clear.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have a working design in another branch that is closer to making me happy. I'm going to start breaking off pieces and opening separate PRs now.

if unique_stats is None:
self._unique_stats = UniqueSourceStats()
elif isinstance(unique_stats, UniqueSourceStats) or callable(unique_stats):
self._unique_stats = unique_stats
else: # pragma: no cover
raise TypeError(f"Unexpected unique_stats argument, got {unique_stats}")

@property
def unique_stats(self) -> UniqueSourceStats:
"""Get unique-value statistics."""
if callable(self._unique_stats):
return self._unique_stats()
return self._unique_stats

@property
def unique_count(self) -> int | None:
"""Get unique count."""
return self.unique_stats.count

@property
def unique_fraction(self) -> float | None:
"""Get unique fraction."""
return self.unique_stats.fraction


class ColumnStats:
    """
    Statistics tracked for a single column.

    Parameters
    ----------
    name
        Column name.
    unique_count
        Estimated number of unique values.
    source_stats
        Statistics gathered from the column's source node.
    """

    __slots__ = ("name", "source_stats", "unique_count")

    def __init__(
        self,
        *,
        name: str | None = None,
        unique_count: int | None = None,
        source_stats: ColumnSourceStats | None = None,
    ) -> None:
        self.source_stats = source_stats
        self.unique_count = unique_count
        self.name = name


class StatsCollector:
    """Column statistics collector."""

    __slots__ = ("cardinality", "column_stats")

    def __init__(self) -> None:
        # Row-count estimates, keyed by IR node.
        self.cardinality: dict[IR, int] = {}
        # Per-IR-node mapping of column name to its statistics.
        self.column_stats: dict[IR, dict[str, ColumnStats]] = {}
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,7 @@ def get_executor_options(
and benchmark.__name__ == "PDSHQueries"
and run_config.executor == "streaming"
):
executor_options["cardinality_factor"] = {
executor_options["unique_fraction"] = {
"c_custkey": 0.05,
"l_orderkey": 1.0,
"l_partkey": 0.1,
Expand Down
25 changes: 23 additions & 2 deletions python/cudf_polars/cudf_polars/experimental/dispatch.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
# SPDX-License-Identifier: Apache-2.0
"""Multi-partition dispatch functions."""

Expand All @@ -12,8 +12,9 @@
from typing import TypeAlias

from cudf_polars.dsl.ir import IR
from cudf_polars.experimental.base import PartitionInfo
from cudf_polars.experimental.base import PartitionInfo, StatsCollector
from cudf_polars.typing import GenericTransformer
from cudf_polars.utils.config import ConfigOptions


LowerIRTransformer: TypeAlias = (
Expand Down Expand Up @@ -82,3 +83,23 @@ def generate_ir_tasks(
task_graph
"""
raise AssertionError(f"Unhandled type {type(ir)}") # pragma: no cover


@singledispatch
def add_source_stats(
    ir: IR, stats: StatsCollector, config_options: ConfigOptions
) -> None:
    """
    Add basic source statistics for an IR node.

    Parameters
    ----------
    ir
        The IR node to collect source statistics for.
    stats
        The `StatsCollector` object to update with new
        source statistics.
    config_options
        GPUEngine configuration options.
    """
    # Base overload: every supported IR type must register its own
    # implementation, so reaching this is a programming error.
    raise AssertionError(f"Unhandled type {type(ir)}")  # pragma: no cover
44 changes: 27 additions & 17 deletions python/cudf_polars/cudf_polars/experimental/distinct.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,11 @@
from cudf_polars.dsl.ir import Distinct
from cudf_polars.experimental.base import PartitionInfo
from cudf_polars.experimental.dispatch import lower_ir_node
from cudf_polars.experimental.utils import _fallback_inform, _lower_ir_fallback
from cudf_polars.experimental.utils import (
_fallback_inform,
_get_unique_fractions,
_lower_ir_fallback,
)

if TYPE_CHECKING:
from collections.abc import MutableMapping
Expand All @@ -29,7 +33,7 @@ def lower_distinct(
partition_info: MutableMapping[IR, PartitionInfo],
config_options: ConfigOptions,
*,
cardinality: float | None = None,
unique_fraction: float | None = None,
) -> tuple[IR, MutableMapping[IR, PartitionInfo]]:
"""
Lower a Distinct IR into partition-wise stages.
Expand All @@ -46,8 +50,8 @@ def lower_distinct(
associated partitioning information.
config_options
GPUEngine configuration options.
cardinality
Cardinality factor to use for algorithm selection.
unique_fraction
Fractional unique count to use for algorithm selection.

Returns
-------
Expand Down Expand Up @@ -112,14 +116,14 @@ def lower_distinct(
# partitions. For now, we raise an error to fall back
# to one partition.
raise NotImplementedError("Unsupported slice for multiple partitions.")
elif cardinality is not None:
# Use cardinality to determine partitioningcardinality
n_ary = min(max(int(1.0 / cardinality), 2), child_count)
output_count = max(int(cardinality * child_count), 1)
elif unique_fraction is not None:
# Use unique_fraction to determine partitioning
n_ary = min(max(int(1.0 / unique_fraction), 2), child_count)
output_count = max(int(unique_fraction * child_count), 1)

if output_count > 1 and require_tree_reduction:
# Need to reduce down to a single partition even
# if the cardinality is large.
# if the unique_fraction is large.
output_count = 1
_fallback_inform(
"Unsupported unique options for multiple partitions.",
Expand Down Expand Up @@ -164,24 +168,30 @@ def _(
# Extract child partitioning
child, partition_info = rec(ir.children[0])
config_options = rec.state["config_options"]
column_stats = rec.state["stats"].column_stats.get(ir.children[0], {})

assert config_options.executor.name == "streaming", (
"'in-memory' executor not supported in 'lower_ir_node'"
)

subset: frozenset = ir.subset or frozenset(ir.schema)
cardinality_factor = {
c: max(min(f, 1.0), 0.00001)
for c, f in config_options.executor.cardinality_factor.items()
if c in subset
}
cardinality = max(cardinality_factor.values()) if cardinality_factor else None
subset: frozenset[str] = ir.subset or frozenset(ir.schema)
unique_fraction_dict = _get_unique_fractions(
tuple(subset),
config_options.executor.unique_fraction,
column_stats,
)

unique_fraction = (
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a difference between {} and None here? I think that unique_fraction_dict could be {} here if both column_statistics and config_options.executor.unique_fraction are empty. Then we'll have bool(unique_fraction_dict) is False and get the None.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Similar question at https://github.com/rapidsai/cudf/pull/19130/files#diff-c93704a29e1c3263ea7267bf2028f187f43d40727bda1b8ffbcdd87076bac77bR187 in expressions.py.

The output here is float | None, which does have a very different meaning in lower_distinct. But the input max_fraction_dict seems to always be a dict (possibly empty).

So how about unique_fraction = max(unique_fraction_dict.values(), default=None)? Not a big deal either way.

max(unique_fraction_dict.values()) if unique_fraction_dict else None
)

try:
return lower_distinct(
ir,
child,
partition_info,
config_options,
cardinality=cardinality,
unique_fraction=unique_fraction,
)
except NotImplementedError as err:
return _lower_ir_fallback(ir, rec, msg=str(err))
Loading