
Commit 81cf351

[Data] Add approximate quantile to aggregator (#57598)
## Why are these changes needed?

Add an `ApproximateQuantile` aggregator to Ray Data using the DataSketches KLL sketch.

Reasons:
- Enables efficient support for the summary API.
- More scalable than the exact `Quantile` on large datasets.

Note:
- DataSketches is not added as a Ray dependency; if it is missing, users are prompted to install it.

---

Here's a simple test showing the efficiency difference between `ApproximateQuantile` and `Quantile`:

```py
import time

import ray
import ray.data
from ray.data.aggregate import ApproximateQuantile, Quantile

ray.init(num_cpus=16)

ds = ray.data.range(10**8)
start_time = time.time()
print(ds.aggregate(ApproximateQuantile(on="id", quantiles=[0.5])))
print(f"Time taken ApproximateQuantile: {time.time() - start_time} seconds")

ds = ray.data.range(10**8)
start_time = time.time()
print(ds.aggregate(Quantile(on="id", q=0.5)))
print(f"Time taken Quantile: {time.time() - start_time} seconds")
```

In this run with 1e8 rows, the approximate median returned 49,979,428.0 in ~12.46 s, while the exact `Quantile` returned 49,999,999.5 in ~163.33 s. The difference reflects the sketch's accuracy trade-off in exchange for significant speed and scalability gains. With k=800 (the default), the error rate is guaranteed to be < 0.45%. In this test the error rate is `(49,999,999.5 - 49,979,428.0) / 49,999,999.5` = 0.00041143 = 0.041143%, which is < 0.45%, and the approximate median is computed **13.11x** faster.

```
{'approx_quantile(id)': [49979428.0]}
Time taken ApproximateQuantile: 12.457247257232666 seconds
{'quantile(id)': 49999999.5}
Time taken Quantile: 163.32705521583557 seconds
```

## Related issue number

## Checks

- [ ] I've signed off every commit (by using the `-s` flag, i.e., `git commit -s`) in this PR.
- [ ] I've run pre-commit jobs to lint the changes in this PR. ([pre-commit setup](https://docs.ray.io/en/latest/ray-contribute/getting-involved.html#lint-and-formatting))
- [ ] I've included any doc changes needed for https://docs.ray.io/en/master/.
- [ ] I've added any new APIs to the API Reference. For example, if I added a method in Tune, I've added it in `doc/source/tune/api/` under the corresponding `.rst` file.
- [ ] I've made sure the tests are passing. Note that there might be a few flaky tests; see the recent failures at https://flakey-tests.ray.io/
- Testing Strategy
  - [ ] Unit tests
  - [ ] Release tests
  - [ ] This PR is not tested :(

---------

Signed-off-by: You-Cheng Lin (Owen) <mses010108@gmail.com>
Signed-off-by: You-Cheng Lin <106612301+owenowenisme@users.noreply.github.com>
Co-authored-by: Balaji Veeramani <balaji@anyscale.com>
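The error-rate and speedup figures quoted above can be reproduced with a few lines of arithmetic; this is just a sanity check on the reported numbers, not part of the change:

```python
# Sanity-checking the accuracy/speed trade-off reported in the benchmark run above.
exact_median = 49_999_999.5    # result of Quantile
approx_median = 49_979_428.0   # result of ApproximateQuantile
t_exact = 163.32705521583557   # seconds
t_approx = 12.457247257232666  # seconds

rel_error = (exact_median - approx_median) / exact_median
speedup = t_exact / t_approx

assert rel_error < 0.0045  # within the 0.45% error bound for k=800
print(f"error = {rel_error:.6%}")   # → error = 0.041143%
print(f"speedup = {speedup:.2f}x")  # → speedup = 13.11x
```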
1 parent 3668836 commit 81cf351

File tree

7 files changed (+214, -19 lines)


doc/source/data/api/aggregate.rst

Lines changed: 2 additions & 0 deletions
```diff
@@ -27,3 +27,5 @@ compute aggregations.
     Unique
     MissingValuePercentage
     ZeroPercentage
+    ApproximateQuantile
+
```

python/ray/data/aggregate.py

Lines changed: 98 additions & 0 deletions
```diff
@@ -1189,3 +1189,101 @@ def finalize(self, accumulator: List[int]) -> Optional[float]:
         if accumulator[1] == 0:
             return None
         return (accumulator[0] / accumulator[1]) * 100.0
+
+
+@PublicAPI(stability="alpha")
+class ApproximateQuantile(AggregateFnV2):
+    def _require_datasketches(self):
+        try:
+            from datasketches import kll_floats_sketch  # type: ignore[import]
+        except ImportError as exc:
+            raise ImportError(
+                "ApproximateQuantile requires the `datasketches` package. "
+                "Install it with `pip install datasketches`."
+            ) from exc
+        return kll_floats_sketch
+
+    def __init__(
+        self,
+        on: str,
+        quantiles: List[float],
+        quantile_precision: int = 800,
+        alias_name: Optional[str] = None,
+    ):
+        """
+        Computes the approximate quantiles of a column by using a datasketches kll_floats_sketch.
+        https://datasketches.apache.org/docs/KLL/KLLSketch.html
+
+        The accuracy of the KLL quantile sketch is a function of the configured quantile precision, which also affects
+        the overall size of the sketch.
+        The KLL sketch has absolute error. For example, a specified rank accuracy of 1% at the
+        median (rank = 0.50) means that the true quantile (if you could extract it from the set)
+        should be between getQuantile(0.49) and getQuantile(0.51). This same 1% error applied at a
+        rank of 0.95 means that the true quantile should be between getQuantile(0.94) and getQuantile(0.96).
+        In other words, the error is a fixed +/- epsilon for the entire range of ranks.
+
+        Typical single-sided rank error by quantile_precision (use for getQuantile/getRank):
+
+        - quantile_precision=100 → ~2.61%
+        - quantile_precision=200 → ~1.33%
+        - quantile_precision=400 → ~0.68%
+        - quantile_precision=800 → ~0.35%
+
+        See https://datasketches.apache.org/docs/KLL/KLLAccuracyAndSize.html for details on accuracy and size.
+
+        Null values in the target column are ignored when constructing the sketch.
+
+        Example:
+
+            .. testcode::
+
+                import ray
+                from ray.data.aggregate import ApproximateQuantile
+
+                # Create a dataset with some values
+                ds = ray.data.from_items(
+                    [{"value": 20.0}, {"value": 40.0}, {"value": 60.0},
+                     {"value": 80.0}, {"value": 100.0}]
+                )
+
+                result = ds.aggregate(ApproximateQuantile(on="value", quantiles=[0.1, 0.5, 0.9]))
+                # Result: {'approx_quantile(value)': [20.0, 60.0, 100.0]}
+
+        Args:
+            on: The name of the column to calculate the quantile on. Must be a numeric column.
+            quantiles: The list of quantiles to compute. Must be between 0 and 1 inclusive.
+                For example, quantiles=[0.5] computes the median. Null entries in the source
+                column are skipped.
+            quantile_precision: Controls the accuracy and memory footprint of the sketch
+                (K in KLL); higher values yield lower error but use more memory. Defaults to 800.
+                See https://datasketches.apache.org/docs/KLL/KLLAccuracyAndSize.html for details
+                on accuracy and size.
+            alias_name: Optional name for the resulting column. If not provided, defaults to
+                "approx_quantile({column_name})".
+        """
+        self._sketch_cls = self._require_datasketches()
+        self._quantiles = quantiles
+        self._quantile_precision = quantile_precision
+        super().__init__(
+            alias_name if alias_name else f"approx_quantile({str(on)})",
+            on=on,
+            ignore_nulls=True,
+            zero_factory=lambda: self.zero(quantile_precision).serialize(),
+        )
+
+    def zero(self, quantile_precision: int):
+        return self._sketch_cls(k=quantile_precision)
+
+    def aggregate_block(self, block: Block) -> bytes:
+        block_acc = BlockAccessor.for_block(block)
+        table = block_acc.to_arrow()
+        column = table.column(self.get_target_column())
+        sketch = self.zero(self._quantile_precision)
+        for value in column:
+            # we ignore nulls here
+            if value.as_py() is not None:
+                sketch.update(float(value.as_py()))
+        return sketch.serialize()
+
+    def combine(self, current_accumulator: bytes, new: bytes) -> bytes:
+        combined = self.zero(self._quantile_precision)
+        combined.merge(self._sketch_cls.deserialize(current_accumulator))
+        combined.merge(self._sketch_cls.deserialize(new))
+        return combined.serialize()
+
+    def finalize(self, accumulator: bytes) -> List[float]:
+        return self._sketch_cls.deserialize(accumulator).get_quantiles(self._quantiles)
```
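For readers unfamiliar with `AggregateFnV2`, the class above follows a serialize/merge pattern: each block produces a serialized sketch, `combine` merges two serialized accumulators, and `finalize` extracts the quantiles. Below is a minimal pure-Python sketch of that same contract, substituting exact sorted samples for the KLL sketch; the function names are illustrative, not Ray APIs:

```python
import pickle
from typing import List, Optional

def aggregate_block(values: List[Optional[float]]) -> bytes:
    # Per-block accumulator, serialized so it can cross task boundaries.
    # Nulls are skipped, mirroring ignore_nulls=True in the aggregator above.
    return pickle.dumps(sorted(v for v in values if v is not None))

def combine(acc_a: bytes, acc_b: bytes) -> bytes:
    # Merging is associative and commutative, so partial results
    # from different blocks can be combined in any order.
    return pickle.dumps(sorted(pickle.loads(acc_a) + pickle.loads(acc_b)))

def finalize(acc: bytes, quantiles: List[float]) -> List[float]:
    # Nearest-rank lookup on the exact samples; the real aggregator
    # delegates to kll_floats_sketch.get_quantiles instead.
    data = pickle.loads(acc)
    return [data[min(int(q * len(data)), len(data) - 1)] for q in quantiles]

acc = combine(aggregate_block([3.0, 1.0, None]), aggregate_block([2.0, 4.0]))
print(finalize(acc, [0.5]))  # → [3.0]
```

The key property the KLL sketch adds over this exact version is bounded memory: the accumulator stays small no matter how many rows each block contributes.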

python/ray/data/stats.py

Lines changed: 3 additions & 0 deletions
```diff
@@ -6,6 +6,7 @@
 
 from ray.data.aggregate import (
     AggregateFnV2,
+    ApproximateQuantile,
     Count,
     Max,
     Mean,
@@ -31,6 +32,7 @@ def numerical_aggregators(column: str) -> List[AggregateFnV2]:
     - min
     - max
     - std
+    - approximate_quantile
     - missing_value_percentage
     - zero_percentage
@@ -46,6 +48,7 @@ def numerical_aggregators(column: str) -> List[AggregateFnV2]:
         Min(on=column, ignore_nulls=True),
         Max(on=column, ignore_nulls=True),
         Std(on=column, ignore_nulls=True, ddof=0),
+        ApproximateQuantile(on=column, quantiles=[0.5]),
         MissingValuePercentage(on=column),
         ZeroPercentage(on=column, ignore_nulls=True),
     ]
```

python/ray/data/tests/test_custom_agg.py

Lines changed: 86 additions & 1 deletion
```diff
@@ -2,7 +2,11 @@
 import pytest
 
 import ray
-from ray.data.aggregate import MissingValuePercentage, ZeroPercentage
+from ray.data.aggregate import (
+    ApproximateQuantile,
+    MissingValuePercentage,
+    ZeroPercentage,
+)
 from ray.data.tests.conftest import *  # noqa
 from ray.tests.conftest import *  # noqa
 
@@ -276,6 +280,87 @@ def test_zero_percentage_negative_values(self, ray_start_regular_shared_2_cpus):
         assert result["zero_pct(value)"] == expected
 
 
+class TestApproximateQuantile:
+    """Test cases for ApproximateQuantile aggregation."""
+
+    def test_approximate_quantile_basic(self, ray_start_regular_shared_2_cpus):
+        """Test basic approximate quantile calculation."""
+        data = [
+            {
+                "id": 1,
+                "value": 10,
+            },
+            {"id": 2, "value": 0},
+            {"id": 3, "value": 30},
+            {"id": 4, "value": 0},
+            {"id": 5, "value": 50},
+        ]
+        ds = ray.data.from_items(data)
+
+        result = ds.aggregate(
+            ApproximateQuantile(on="value", quantiles=[0.1, 0.5, 0.9])
+        )
+        expected = [0.0, 10.0, 50.0]
+        assert result["approx_quantile(value)"] == expected
+
+    def test_approximate_quantile_ignores_nulls(self, ray_start_regular_shared_2_cpus):
+        data = [
+            {"id": 1, "value": 5.0},
+            {"id": 2, "value": None},
+            {"id": 3, "value": 15.0},
+            {"id": 4, "value": None},
+            {"id": 5, "value": 25.0},
+        ]
+        ds = ray.data.from_items(data)
+
+        result = ds.aggregate(ApproximateQuantile(on="value", quantiles=[0.5]))
+        assert result["approx_quantile(value)"] == [15.0]
+
+    def test_approximate_quantile_custom_alias(self, ray_start_regular_shared_2_cpus):
+        data = [
+            {"id": 1, "value": 1.0},
+            {"id": 2, "value": 3.0},
+            {"id": 3, "value": 5.0},
+            {"id": 4, "value": 7.0},
+            {"id": 5, "value": 9.0},
+        ]
+        ds = ray.data.from_items(data)
+
+        quantiles = [0.0, 1.0]
+        result = ds.aggregate(
+            ApproximateQuantile(
+                on="value", quantiles=quantiles, alias_name="value_range"
+            )
+        )
+
+        assert result["value_range"] == [1.0, 9.0]
+        assert len(result["value_range"]) == len(quantiles)
+
+    def test_approximate_quantile_groupby(self, ray_start_regular_shared_2_cpus):
+        data = [
+            {"group": "A", "value": 1.0},
+            {"group": "A", "value": 2.0},
+            {"group": "A", "value": 3.0},
+            {"group": "B", "value": 10.0},
+            {"group": "B", "value": 20.0},
+            {"group": "B", "value": 30.0},
+        ]
+        ds = ray.data.from_items(data)
+
+        result = (
+            ds.groupby("group")
+            .aggregate(ApproximateQuantile(on="value", quantiles=[0.5]))
+            .take_all()
+        )
+
+        result_by_group = {
+            row["group"]: row["approx_quantile(value)"] for row in result
+        }
+
+        assert result_by_group["A"] == [2.0]
+        assert result_by_group["B"] == [20.0]
+
+
 if __name__ == "__main__":
     import sys
```

python/ray/data/tests/test_dataset_stats.py

Lines changed: 21 additions & 18 deletions
```diff
@@ -4,6 +4,7 @@
 
 import ray
 from ray.data.aggregate import (
+    ApproximateQuantile,
     Count,
     Max,
     Mean,
@@ -51,8 +52,8 @@ def test_numerical_columns_detection(self):
         assert len(feature_aggs.vector_columns) == 0
 
         # Check that we have the right number of aggregators
-        # 3 numerical columns * 7 aggregators each + 1 string column * 2 aggregators = 23 total
-        assert len(feature_aggs.aggregators) == 23
+        # 3 numerical columns * 8 aggregators each + 1 string column * 2 aggregators = 26 total
+        assert len(feature_aggs.aggregators) == 26
 
     def test_categorical_columns_detection(self):
         """Test that string columns are correctly identified as categorical."""
@@ -74,8 +75,8 @@ def test_categorical_columns_detection(self):
         assert "value" in feature_aggs.numerical_columns
         assert "category" not in feature_aggs.numerical_columns
 
-        # Check aggregator count: 1 numerical * 7 + 2 categorical * 2 = 11
-        assert len(feature_aggs.aggregators) == 11
+        # Check aggregator count: 1 numerical * 8 + 2 categorical * 2 = 12
+        assert len(feature_aggs.aggregators) == 12
 
     def test_vector_columns_detection(self):
         """Test that list columns are correctly identified as vector columns."""
@@ -97,8 +98,8 @@ def test_vector_columns_detection(self):
         assert "scalar" in feature_aggs.numerical_columns
         assert "text" in feature_aggs.str_columns
 
-        # Check aggregator count: 1 numerical * 7 + 1 categorical * 2 + 1 vector * 2 = 11
-        assert len(feature_aggs.aggregators) == 11
+        # Check aggregator count: 1 numerical * 8 + 1 categorical * 2 + 1 vector * 2 = 12
+        assert len(feature_aggs.aggregators) == 12
 
     def test_mixed_column_types(self):
         """Test dataset with all column types mixed together."""
@@ -130,8 +131,8 @@ def test_mixed_column_types(self):
         # bool_val should be treated as numerical (integer-like)
         assert "bool_val" in feature_aggs.numerical_columns
 
-        # Check aggregator count: 3 numerical * 7 + 1 categorical * 2 + 1 vector * 2 = 25
-        assert len(feature_aggs.aggregators) == 25
+        # Check aggregator count: 3 numerical * 8 + 1 categorical * 2 + 1 vector * 2 = 28
+        assert len(feature_aggs.aggregators) == 28
 
     def test_column_filtering(self):
         """Test that only specified columns are included when columns parameter is provided."""
@@ -151,8 +152,8 @@ def test_column_filtering(self):
         assert "col3" in feature_aggs.vector_columns
         assert "col4" not in feature_aggs.numerical_columns
 
-        # Check aggregator count: 1 numerical * 7 + 1 vector * 2 = 9
-        assert len(feature_aggs.aggregators) == 9
+        # Check aggregator count: 1 numerical * 8 + 1 vector * 2 = 10
+        assert len(feature_aggs.aggregators) == 10
 
     def test_empty_dataset_schema(self):
         """Test behavior with empty dataset that has no schema."""
@@ -199,8 +200,8 @@ def test_unsupported_column_types(self):
         assert "unsupported_binary" not in feature_aggs.str_columns
         assert "unsupported_binary" not in feature_aggs.vector_columns
 
-        # Check aggregator count: 1 numerical * 7 + 1 categorical * 2 = 9
-        assert len(feature_aggs.aggregators) == 9
+        # Check aggregator count: 1 numerical * 8 + 1 categorical * 2 = 10
+        assert len(feature_aggs.aggregators) == 10
 
     def test_aggregator_types_verification(self):
         """Test that the correct aggregator types are generated for each column type."""
@@ -215,16 +216,17 @@ def test_aggregator_types_verification(self):
         # Check that we have the right types of aggregators
         agg_names = [agg.name for agg in feature_aggs.aggregators]
 
-        # Numerical aggregators should include all 7 types
+        # Numerical aggregators should include all 8 types
         num_agg_names = [name for name in agg_names if "num" in name]
-        assert len(num_agg_names) == 7
+        assert len(num_agg_names) == 8
         assert any("count" in name.lower() for name in num_agg_names)
         assert any("mean" in name.lower() for name in num_agg_names)
         assert any("min" in name.lower() for name in num_agg_names)
         assert any("max" in name.lower() for name in num_agg_names)
         assert any("std" in name.lower() for name in num_agg_names)
         assert any("missing" in name.lower() for name in num_agg_names)
         assert any("zero" in name.lower() for name in num_agg_names)
+        assert any("approx_quantile" in name.lower() for name in num_agg_names)
 
         # Categorical aggregators should include count and missing percentage
         cat_agg_names = [name for name in agg_names if "cat" in name]
@@ -246,7 +248,7 @@ def test_aggregator_instances_verification(self):
 
         # Find aggregators for the numerical column
         num_aggs = [agg for agg in feature_aggs.aggregators if "num" in agg.name]
-        assert len(num_aggs) == 7
+        assert len(num_aggs) == 8
 
         # Check that we have the right aggregator types
         agg_types = [type(agg) for agg in num_aggs]
@@ -257,6 +259,7 @@ def test_aggregator_instances_verification(self):
         assert Std in agg_types
         assert MissingValuePercentage in agg_types
         assert ZeroPercentage in agg_types
+        assert ApproximateQuantile in agg_types
 
         # Find aggregators for the categorical column
         cat_aggs = [agg for agg in feature_aggs.aggregators if "cat" in agg.name]
@@ -352,8 +355,8 @@ def test_large_dataset_performance(self):
         assert "category" in feature_aggs.str_columns
         assert "vector" in feature_aggs.vector_columns
 
-        # Check aggregator count: 2 numerical * 7 + 1 categorical * 2 + 1 vector * 2 = 18
-        assert len(feature_aggs.aggregators) == 18
+        # Check aggregator count: 2 numerical * 8 + 1 categorical * 2 + 1 vector * 2 = 20
+        assert len(feature_aggs.aggregators) == 20
 
 
 class TestIndividualAggregatorFunctions:
@@ -363,7 +366,7 @@ def test_numerical_aggregators(self):
         """Test numerical_aggregators function."""
         aggs = numerical_aggregators("test_column")
 
-        assert len(aggs) == 7
+        assert len(aggs) == 8
         assert all(hasattr(agg, "get_target_column") for agg in aggs)
         assert all(agg.get_target_column() == "test_column" for agg in aggs)
```

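The aggregator-count updates throughout this test file all follow one formula: 8 aggregators per numerical column (the new `ApproximateQuantile` plus the existing 7), and 2 each per categorical and vector column. A quick check of the updated totals (the helper name here is illustrative):

```python
# 8 aggregators per numerical column, 2 per categorical, 2 per vector column.
def expected_count(numerical: int, categorical: int = 0, vector: int = 0) -> int:
    return numerical * 8 + categorical * 2 + vector * 2

assert expected_count(3, categorical=1) == 26            # numerical columns detection
assert expected_count(1, categorical=2) == 12            # categorical columns detection
assert expected_count(1, categorical=1, vector=1) == 12  # vector columns detection
assert expected_count(3, categorical=1, vector=1) == 28  # mixed column types
assert expected_count(1, vector=1) == 10                 # column filtering
assert expected_count(2, categorical=1, vector=1) == 20  # large dataset performance
```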
python/requirements/ml/data-test-requirements.txt

Lines changed: 1 addition & 0 deletions
```diff
@@ -23,3 +23,4 @@ pyiceberg[sql-sqlite]==0.9.0
 clickhouse-connect
 pybase64
 hudi==0.4.0
+datasketches
```

python/requirements_compiled.txt

Lines changed: 3 additions & 0 deletions
```diff
@@ -421,6 +421,8 @@ datasets==3.6.0
     # -r python/requirements/ml/data-test-requirements.txt
     # -r python/requirements/ml/train-requirements.txt
     # evaluate
+datasketches==5.2.0
+    # via -r python/requirements/ml/data-test-requirements.txt
 debugpy==1.8.0
     # via ipykernel
 decorator==5.1.1
@@ -1247,6 +1249,7 @@ numpy==1.26.4
     # cupy-cuda12x
     # dask
     # datasets
+    # datasketches
     # decord
     # deepspeed
     # dm-control
```
