Skip to content

Commit 22cf6ef

Browse files
bveeramani, owenowenisme, and gemini-code-assist[bot]
authored
[Data] Don't downscale actors if the operator hasn't received any inputs (#59883)
If you have a pipeline like `read --> [some cpu transformation] --> [gpu transformation init_concurrency =N] --> write`, the `gpu transformation` might downscale to 0 actors if the CPU transformation is slow. This basically nullifies `init_concurrency` and can cause cold-start delays. --------- Signed-off-by: Balaji Veeramani <bveeramani@berkeley.edu> Signed-off-by: You-Cheng Lin <106612301+owenowenisme@users.noreply.github.com> Co-authored-by: You-Cheng Lin <106612301+owenowenisme@users.noreply.github.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
1 parent fc78704 commit 22cf6ef

File tree

2 files changed

+14
-2
lines changed

2 files changed

+14
-2
lines changed

python/ray/data/_internal/actor_autoscaler/default_actor_autoscaler.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,11 @@ def _derive_target_scaling_config(
122122
reason="pool exceeding max size",
123123
)
124124

125+
# To prevent unexpected downscaling from the initial size, short-circuit if
126+
# the operator hasn't received any inputs.
127+
if op.metrics.num_inputs_received == 0:
128+
return ActorPoolScalingRequest.no_op(reason="no inputs received")
129+
125130
# Determine whether to scale up based on the actor pool utilization.
126131
util = self._compute_utilization(actor_pool)
127132

python/ray/data/tests/test_autoscaler.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ def test_actor_pool_scaling():
6767
_inputs_complete=False,
6868
input_dependencies=[MagicMock()],
6969
internal_input_queue_num_blocks=MagicMock(return_value=1),
70-
metrics=MagicMock(average_num_inputs_per_task=1),
70+
metrics=MagicMock(average_num_inputs_per_task=1, num_inputs_received=1),
7171
)
7272
op_state = OpState(
7373
op, inqueues=[MagicMock(__len__=MagicMock(return_value=10), num_blocks=10)]
@@ -217,6 +217,13 @@ def assert_autoscaling_action(
217217
expected_reason="exceeded resource limits",
218218
)
219219

220+
# Should no-op because the op has not received any inputs.
221+
with patch(op.metrics, "num_inputs_received", 0, is_method=False):
222+
assert_autoscaling_action(
223+
delta=0,
224+
expected_reason="no inputs received",
225+
)
226+
220227

221228
@pytest.fixture
222229
def autoscaler_max_upscaling_delta_setup():
@@ -239,7 +246,7 @@ def autoscaler_max_upscaling_delta_setup():
239246
spec=InternalQueueOperatorMixin,
240247
has_completed=MagicMock(return_value=False),
241248
_inputs_complete=False,
242-
metrics=MagicMock(average_num_inputs_per_task=1),
249+
metrics=MagicMock(average_num_inputs_per_task=1, num_inputs_received=1),
243250
)
244251
op_state = MagicMock(
245252
spec=OpState,

0 commit comments

Comments (0)