Skip to content

Commit cd69734

Browse files
srinathk10 and peterxcli
authored and committed
[Data] Enable and Tune DownstreamCapacityBackpressurePolicy (ray-project#59753)
> Thank you for contributing to Ray! 🚀 > Please review the [Ray Contribution Guide](https://docs.ray.io/en/master/ray-contribute/getting-involved.html) before opening a pull request. > ⚠️ Remove these instructions before submitting your PR. > 💡 Tip: Mark as draft if you want early feedback, or ready for review when it's complete. ## Description > Briefly describe what this PR accomplishes and why it's needed. ### [Data] Enable and Tune DownstreamCapacityBackpressurePolicy - To backpressure a given Op, use the queue-size build-up / downstream-capacity ratio. This ratio represents the upper limit of buffering in the object store between pipeline stages to optimize for throughput. - Wait until OBJECT_STORE_BUDGET_UTIL_THRESHOLD of the Op's utilization is reached before this backpressure policy can kick in, so a steady state is reached. - Skip this backpressure policy if the current Op or a downstream Op is materializing. ## Related issues > Link related issues: "Fixes ray-project#1234", "Closes ray-project#1234", or "Related to ray-project#1234". ## Additional information > Optional: Add implementation details, API changes, usage examples, screenshots, etc. --------- Signed-off-by: Srinath Krishnamachari <srinath.krishnamachari@anyscale.com> Signed-off-by: Srinath Krishnamachari <68668616+srinathk10@users.noreply.github.com> Signed-off-by: peterxcli <peterxcli@gmail.com>
1 parent cf85995 commit cd69734

File tree

14 files changed

+978
-210
lines changed

14 files changed

+978
-210
lines changed

python/ray/data/_internal/block_batching/iter_batches.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,8 @@ class BatchIterator:
9696
the specified amount of formatted batches from blocks. This improves
9797
performance for non-CPU bound UDFs, allowing batch fetching compute and
9898
formatting to be overlapped with the UDF. Defaults to 1.
99+
prefetch_bytes_callback: A callback to report prefetched bytes to the executor's
100+
resource manager.
99101
"""
100102

101103
UPDATE_METRICS_INTERVAL_S: float = 5.0
@@ -116,6 +118,7 @@ def __init__(
116118
shuffle_seed: Optional[int] = None,
117119
ensure_copy: bool = False,
118120
prefetch_batches: int = 1,
121+
prefetch_bytes_callback: Optional[Callable[[int], None]] = None,
119122
):
120123
self._ref_bundles = ref_bundles
121124
self._stats = stats
@@ -129,6 +132,7 @@ def __init__(
129132
self._shuffle_seed = shuffle_seed
130133
self._ensure_copy = ensure_copy
131134
self._prefetch_batches = prefetch_batches
135+
self._prefetch_bytes_callback = prefetch_bytes_callback
132136
# TODO: pass the dataset's context down instead of fetching the global context here.
133137
self._ctx = DataContext.get_current()
134138
self._eager_free = clear_block_after_read and self._ctx.eager_free
@@ -271,6 +275,10 @@ def before_epoch_start(self):
271275
self._yielded_first_batch = False
272276

273277
def after_epoch_end(self):
278+
# Report 0 prefetched bytes at the end of iteration.
279+
if self._prefetch_bytes_callback is not None:
280+
self._prefetch_bytes_callback(0)
281+
274282
if self._stats is None:
275283
return
276284

@@ -300,6 +308,10 @@ def yield_batch_context(self, batch: Batch):
300308
with self._stats.iter_user_s.timer() if self._stats else nullcontext():
301309
yield
302310

311+
# Report prefetched bytes to the executor's resource manager.
312+
if self._prefetch_bytes_callback is not None and self._stats is not None:
313+
self._prefetch_bytes_callback(self._stats.iter_prefetched_bytes)
314+
303315
if self._stats is None:
304316
return
305317
now = time.time()

python/ray/data/_internal/execution/backpressure_policy/concurrency_cap_backpressure_policy.py

Lines changed: 13 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -159,19 +159,18 @@ def can_add_input(self, op: "PhysicalOperator") -> bool:
159159
return num_tasks_running < self._concurrency_caps[op]
160160

161161
# For this Op, if the objectstore budget (available) to total
162-
# ratio is below threshold (10%), skip dynamic output queue size backpressure.
163-
op_usage = self._resource_manager.get_op_usage(op)
164-
op_budget = self._resource_manager.get_budget(op)
165-
if op_usage is not None and op_budget is not None:
166-
total_mem = op_usage.object_store_memory + op_budget.object_store_memory
167-
if total_mem == 0 or (
168-
op_budget.object_store_memory / total_mem
169-
> self.AVAILABLE_OBJECT_STORE_BUDGET_THRESHOLD
170-
):
171-
# If the objectstore budget (available) to total
172-
# ratio is above threshold (10%), skip dynamic output queue size
173-
# backpressure, but still enforce the configured cap.
174-
return num_tasks_running < self._concurrency_caps[op]
162+
# ratio is above threshold, skip dynamic output queue size backpressure.
163+
available_budget_fraction = (
164+
self._resource_manager.get_available_object_store_budget_fraction(op)
165+
)
166+
if (
167+
available_budget_fraction is not None
168+
and available_budget_fraction > self.AVAILABLE_OBJECT_STORE_BUDGET_THRESHOLD
169+
):
170+
# If the objectstore budget (available) to total
171+
# ratio is above threshold, skip dynamic output queue size
172+
# backpressure, but still enforce the configured cap.
173+
return num_tasks_running < self._concurrency_caps[op]
175174

176175
# Current total queued bytes (this op + downstream)
177176
current_queue_size_bytes = self._resource_manager.get_mem_op_internal(
@@ -180,7 +179,7 @@ def can_add_input(self, op: "PhysicalOperator") -> bool:
180179

181180
# Update EWMA state (level & dev) and compute effective cap. Note that
182181
# we don't update the EWMA state if the objectstore budget (available) vs total
183-
# ratio is above threshold (10%), because the level and dev adjusts quickly.
182+
# ratio is above threshold, because the level and dev adjusts quickly.
184183
self._update_level_and_dev(op, current_queue_size_bytes)
185184
effective_cap = self._effective_cap(
186185
op, num_tasks_running, current_queue_size_bytes
Lines changed: 112 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,8 @@
11
import logging
2-
from typing import TYPE_CHECKING
2+
from typing import TYPE_CHECKING, Optional
33

44
from .backpressure_policy import BackpressurePolicy
5-
from ray.data._internal.execution.operators.actor_pool_map_operator import (
6-
ActorPoolMapOperator,
7-
)
5+
from ray._private.ray_constants import env_float
86
from ray.data.context import DataContext
97

108
if TYPE_CHECKING:
@@ -20,22 +18,17 @@
2018
class DownstreamCapacityBackpressurePolicy(BackpressurePolicy):
2119
"""Backpressure policy based on downstream processing capacity.
2220
23-
This policy triggers backpressure when the output bundles size exceeds both:
24-
1. A ratio threshold multiplied by the number of running tasks in downstream operators
25-
2. An absolute threshold for the output bundles size
26-
27-
The policy monitors actual downstream processing capacity by tracking the number
28-
of currently running tasks rather than configured parallelism. This approach
29-
ensures effective backpressure even when cluster resources are insufficient or
30-
scaling is slow, preventing memory pressure and maintaining pipeline stability.
31-
32-
Key benefits:
33-
- Prevents memory bloat from unprocessed output objects
34-
- Adapts to actual cluster conditions and resource availability
35-
- Maintains balanced throughput across pipeline operators
36-
- Reduces object spilling and unnecessary rebuilds
21+
To backpressure a given operator, use queue size build up / downstream capacity ratio.
22+
This ratio represents the upper limit of buffering in object store between pipeline stages
23+
to optimize for throughput.
3724
"""
3825

26+
# Threshold for per-Op object store budget utilization vs total
27+
# (utilization / total) ratio to enable downstream capacity backpressure.
28+
OBJECT_STORE_BUDGET_UTIL_THRESHOLD = env_float(
29+
"RAY_DATA_DOWNSTREAM_CAPACITY_OBJECT_STORE_BUDGET_UTIL_THRESHOLD", 0.9
30+
)
31+
3932
@property
4033
def name(self) -> str:
4134
return "DownstreamCapacity"
@@ -47,50 +40,113 @@ def __init__(
4740
resource_manager: "ResourceManager",
4841
):
4942
super().__init__(data_context, topology, resource_manager)
50-
self._backpressure_concurrency_ratio = (
43+
self._backpressure_capacity_ratio = (
5144
self._data_context.downstream_capacity_backpressure_ratio
5245
)
53-
self._backpressure_max_queued_blocks = (
54-
self._data_context.downstream_capacity_backpressure_max_queued_bundles
55-
)
56-
self._backpressure_disabled = (
57-
self._backpressure_concurrency_ratio is None
58-
or self._backpressure_max_queued_blocks is None
46+
if self._backpressure_capacity_ratio is not None:
47+
logger.debug(
48+
f"DownstreamCapacityBackpressurePolicy enabled with backpressure capacity ratio: {self._backpressure_capacity_ratio}"
49+
)
50+
51+
def _get_queue_size_bytes(self, op: "PhysicalOperator") -> int:
52+
"""Get the output current queue size
53+
(this operator + ineligible downstream operators) in bytes for the given operator.
54+
"""
55+
op_outputs_usage = self._topology[op].output_queue_bytes()
56+
# Also account the downstream ineligible operators' memory usage.
57+
op_outputs_usage += sum(
58+
self._resource_manager.get_op_usage(next_op).object_store_memory
59+
for next_op in self._resource_manager._get_downstream_ineligible_ops(op)
5960
)
61+
return op_outputs_usage
6062

61-
def _max_concurrent_tasks(self, op: "PhysicalOperator") -> int:
62-
if isinstance(op, ActorPoolMapOperator):
63-
return sum(
64-
[
65-
actor_pool.max_concurrent_tasks()
66-
for actor_pool in op.get_autoscaling_actor_pools()
67-
]
68-
)
69-
return op.num_active_tasks()
63+
def _get_downstream_capacity_size_bytes(self, op: "PhysicalOperator") -> int:
64+
"""Get the downstream capacity size for the given operator.
7065
71-
def can_add_input(self, op: "PhysicalOperator") -> bool:
72-
"""Determine if we can add input to the operator based on downstream capacity."""
73-
if self._backpressure_disabled:
74-
return True
66+
Downstream capacity size is the sum of the pending task inputs of the
67+
downstream eligible operators.
68+
69+
If an output dependency is ineligible, skip it and recurse down to find
70+
eligible output dependencies. If there are no output dependencies,
71+
return external consumer bytes.
72+
"""
73+
if not op.output_dependencies:
74+
# No output dependencies, return external consumer bytes.
75+
return self._resource_manager.get_external_consumer_bytes()
76+
77+
total_capacity_size_bytes = 0
7578
for output_dependency in op.output_dependencies:
76-
total_enqueued_blocks = self._topology[
77-
output_dependency
78-
].total_enqueued_input_blocks()
79+
if self._resource_manager.is_op_eligible(output_dependency):
80+
# Output dependency is eligible, add its pending task inputs.
81+
total_capacity_size_bytes += (
82+
output_dependency.metrics.obj_store_mem_pending_task_inputs or 0
83+
)
84+
else:
85+
# Output dependency is ineligible, recurse down to find eligible ops.
86+
total_capacity_size_bytes += self._get_downstream_capacity_size_bytes(
87+
output_dependency
88+
)
89+
return total_capacity_size_bytes
7990

80-
avg_inputs_per_task = (
81-
output_dependency.metrics.num_task_inputs_processed
82-
/ max(output_dependency.metrics.num_tasks_finished, 1)
83-
)
84-
outstanding_tasks = total_enqueued_blocks / max(avg_inputs_per_task, 1)
85-
max_allowed_outstanding = (
86-
self._max_concurrent_tasks(output_dependency)
87-
* self._backpressure_concurrency_ratio
88-
)
91+
def _should_skip_backpressure(self, op: "PhysicalOperator") -> bool:
92+
"""Check if backpressure should be skipped for the operator.
93+
TODO(srinathk10): Extract this to common logic to skip invoking BackpressurePolicy.
94+
"""
95+
if self._backpressure_capacity_ratio is None:
96+
# Downstream capacity backpressure is disabled.
97+
return True
98+
if not self._resource_manager.is_op_eligible(op):
99+
# Operator is not eligible for backpressure.
100+
return True
101+
if self._resource_manager.is_materializing_op(op):
102+
# Operator is materializing, so no need to perform backpressure.
103+
return True
104+
if self._resource_manager.has_materializing_downstream_op(op):
105+
# Downstream operator is materializing, so can't perform backpressure
106+
# based on downstream capacity which requires full materialization.
107+
return True
108+
return False
109+
110+
def _get_queue_ratio(self, op: "PhysicalOperator") -> float:
111+
"""Get queue/capacity ratio for the operator."""
112+
queue_size_bytes = self._get_queue_size_bytes(op)
113+
downstream_capacity_size_bytes = self._get_downstream_capacity_size_bytes(op)
114+
if downstream_capacity_size_bytes == 0:
115+
# No downstream capacity to backpressure against, so no backpressure.
116+
return 0
117+
return queue_size_bytes / downstream_capacity_size_bytes
89118

90-
if (
91-
total_enqueued_blocks > self._backpressure_max_queued_blocks
92-
and outstanding_tasks > max_allowed_outstanding
93-
):
94-
return False
119+
def _should_apply_backpressure(self, op: "PhysicalOperator") -> bool:
120+
"""Check if backpressure should be applied for the operator.
121+
122+
Returns True if backpressure should be applied, False otherwise.
123+
"""
124+
if self._should_skip_backpressure(op):
125+
return False
126+
127+
utilized_budget_fraction = (
128+
self._resource_manager.get_utilized_object_store_budget_fraction(op)
129+
)
130+
if (
131+
utilized_budget_fraction is not None
132+
and utilized_budget_fraction <= self.OBJECT_STORE_BUDGET_UTIL_THRESHOLD
133+
):
134+
# Utilized budget fraction is below threshold, so should skip backpressure.
135+
return False
136+
137+
queue_ratio = self._get_queue_ratio(op)
138+
# Apply backpressure if queue ratio exceeds the threshold.
139+
return queue_ratio > self._backpressure_capacity_ratio
140+
141+
def can_add_input(self, op: "PhysicalOperator") -> bool:
142+
"""Determine if we can add input to the operator based on
143+
downstream capacity.
144+
"""
145+
return not self._should_apply_backpressure(op)
95146

96-
return True
147+
def max_task_output_bytes_to_read(self, op: "PhysicalOperator") -> Optional[int]:
148+
"""Return the maximum bytes of pending task outputs can be read for
149+
the given operator. None means no limit."""
150+
if self._should_apply_backpressure(op):
151+
return 0
152+
return None

python/ray/data/_internal/execution/resource_manager.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,12 @@ def __init__(
8989
# input buffers of the downstream operators.
9090
self._mem_op_outputs: Dict[PhysicalOperator, int] = defaultdict(int)
9191

92+
# Bytes buffered by external consumers (iterators) consuming Batches
93+
# (including the prefetched blocks). For example,
94+
# - ds.iter_batches -> one iterator
95+
# - streaming_split -> multiple iterators
96+
self._external_consumer_bytes: int = 0
97+
9298
self._op_resource_allocator: Optional[
9399
"OpResourceAllocator"
94100
] = create_resource_allocator(self, data_context)
@@ -134,6 +140,14 @@ def _warn_about_object_store_memory_if_needed(self):
134140
f"ray.init() or by setting the RAY_DEFAULT_OBJECT_STORE_MEMORY_PROPORTION environment variable."
135141
)
136142

143+
def set_external_consumer_bytes(self, num_bytes: int) -> None:
144+
"""Set the bytes buffered by external consumers."""
145+
self._external_consumer_bytes = num_bytes
146+
147+
def get_external_consumer_bytes(self) -> int:
148+
"""Get the bytes buffered by external consumers."""
149+
return self._external_consumer_bytes
150+
137151
def _estimate_object_store_memory_usage(
138152
self, op: "PhysicalOperator", state: "OpState"
139153
) -> int:
@@ -429,13 +443,39 @@ def get_op_outputs_object_store_usage_with_downstream(
429443
)
430444
return op_outputs_usage
431445

446+
def is_materializing_op(self, op: PhysicalOperator) -> bool:
447+
"""Check if the operator is a materializing operator."""
448+
return isinstance(op, MATERIALIZING_OPERATORS)
449+
432450
def has_materializing_downstream_op(self, op: PhysicalOperator) -> bool:
433451
"""Check if the operator has a downstream materializing operator."""
434452
return any(
435453
isinstance(next_op, MATERIALIZING_OPERATORS)
436454
for next_op in op.output_dependencies
437455
)
438456

457+
def get_available_object_store_budget_fraction(
458+
self, op: PhysicalOperator
459+
) -> Optional[float]:
460+
"""Get available object store memory budget fraction for the operator. Returns None if not available."""
461+
op_usage = self.get_op_usage(op)
462+
op_budget = self.get_budget(op)
463+
if op_usage is None or op_budget is None:
464+
return None
465+
total_mem = op_usage.object_store_memory + op_budget.object_store_memory
466+
if total_mem == 0:
467+
return None
468+
return op_budget.object_store_memory / total_mem
469+
470+
def get_utilized_object_store_budget_fraction(
471+
self, op: PhysicalOperator
472+
) -> Optional[float]:
473+
"""Get utilized object store memory budget fraction for the operator. Returns None if not available."""
474+
available_fraction = self.get_available_object_store_budget_fraction(op)
475+
if available_fraction is None:
476+
return None
477+
return 1 - available_fraction
478+
439479

440480
def _get_first_pending_shuffle_op(topology: "Topology") -> int:
441481
for idx, op in enumerate(topology):

python/ray/data/_internal/execution/streaming_executor.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -426,6 +426,11 @@ def get_stats(self):
426426
else:
427427
return self._generate_stats()
428428

429+
def set_external_consumer_bytes(self, num_bytes: int) -> None:
430+
"""Set the bytes buffered by external consumers."""
431+
if self._resource_manager is not None:
432+
self._resource_manager.set_external_consumer_bytes(num_bytes)
433+
429434
def _generate_stats(self) -> DatasetStats:
430435
"""Create a new stats object reflecting execution status so far."""
431436
stats = self._initial_stats or DatasetStats(metadata={}, parent=None)

python/ray/data/_internal/iterator/iterator_impl.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
if TYPE_CHECKING:
99
import pyarrow
1010

11+
from ray.data._internal.execution.streaming_executor import StreamingExecutor
1112
from ray.data.dataset import Dataset
1213

1314

@@ -23,9 +24,18 @@ def __repr__(self) -> str:
2324

2425
def _to_ref_bundle_iterator(
2526
self,
26-
) -> Tuple[Iterator[RefBundle], Optional[DatasetStats], bool]:
27-
ref_bundles_iterator, stats = self._base_dataset._execute_to_iterator()
28-
return ref_bundles_iterator, stats, False
27+
) -> Tuple[
28+
Iterator[RefBundle],
29+
Optional[DatasetStats],
30+
bool,
31+
Optional["StreamingExecutor"],
32+
]:
33+
(
34+
ref_bundles_iterator,
35+
stats,
36+
executor,
37+
) = self._base_dataset._execute_to_iterator()
38+
return ref_bundles_iterator, stats, False, executor
2939

3040
def stats(self) -> str:
3141
return self._base_dataset.stats()

0 commit comments

Comments (0)