ray-project
diff --git a/‎ci/lint/pydoclint-baseline.txt‎
Lines changed: 0 additions & 5 deletions b/‎ci/lint/pydoclint-baseline.txt‎
Lines changed: 0 additions & 5 deletions
diff --git a/‎doc/source/serve/advanced-guides/advanced-autoscaling.md‎
Lines changed: 46 additions & 2 deletions b/‎doc/source/serve/advanced-guides/advanced-autoscaling.md‎
Lines changed: 46 additions & 2 deletions
diff --git a/‎doc/source/serve/api/index.md‎
Lines changed: 1 addition & 0 deletions b/‎doc/source/serve/api/index.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎doc/source/serve/doc_code/application_level_autoscaling_with_defaults.yaml‎
Lines changed: 20 additions & 0 deletions b/‎doc/source/serve/doc_code/application_level_autoscaling_with_defaults.yaml‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎doc/source/serve/doc_code/autoscaling_policy.py‎
Lines changed: 45 additions & 0 deletions b/‎doc/source/serve/doc_code/autoscaling_policy.py‎
Lines changed: 45 additions & 0 deletions
diff --git a/‎python/ray/serve/_private/autoscaling_state.py‎
Lines changed: 24 additions & 7 deletions b/‎python/ray/serve/_private/autoscaling_state.py‎
Lines changed: 24 additions & 7 deletions
@@ -1609,11 +1609,6 @@ python/ray/serve/api.py
     DOC103: Function `get_deployment_handle`: Docstring arguments are different from function arguments. (Or could be other formatting issues: https://jsh9.github.io/pydoclint/violation_codes.html#notes-on-doc103 ). Arguments in the function signature but not in the docstring: [_check_exists: bool, _record_telemetry: bool].
     DOC201: Function `get_deployment_handle` does not have a return section in docstring
 --------------------
-python/ray/serve/autoscaling_policy.py
-    DOC101: Function `_calculate_desired_num_replicas`: Docstring contains fewer arguments than in function signature.
-    DOC111: Function `_calculate_desired_num_replicas`: The option `--arg-type-hints-in-docstring` is `False` but there are type hints in the docstring arg list
-    DOC103: Function `_calculate_desired_num_replicas`: Docstring arguments are different from function arguments. (Or could be other formatting issues: https://jsh9.github.io/pydoclint/violation_codes.html#notes-on-doc103 ). Arguments in the function signature but not in the docstring: [num_running_replicas: int, total_num_requests: int]. Arguments in the docstring but not in the function signature: [current_num_ongoing_requests: List[float]].
---------------------
 python/ray/serve/batching.py
     DOC111: Method `_BatchQueue.__init__`: The option `--arg-type-hints-in-docstring` is `False` but there are type hints in the docstring arg list
     DOC101: Function `batch`: Docstring contains fewer arguments than in function signature.
 
@@ -637,11 +637,41 @@ Policies are defined **per deployment**. If you don’t provide one, Ray Serve f
 
 The policy function is invoked by the Ray Serve controller every `RAY_SERVE_CONTROL_LOOP_INTERVAL_S` seconds (default **0.1s**), so your logic runs against near-real-time state.
 
+Your policy can return an `int` or a `float` for `target_replicas`. If it returns a float, Ray Serve converts it to an integer replica count by rounding up to the nearest integer.
+
 :::{warning}
 Keep policy functions **fast and lightweight**. Slow logic can block the Serve controller and degrade cluster responsiveness.
 :::
 
 
+### Applying standard autoscaling parameters to custom policies  
+  
+Ray Serve automatically applies the following standard autoscaling parameters from your [`AutoscalingConfig`](../api/doc/ray.serve.config.AutoscalingConfig.rst) to custom policies:
+- `upscale_delay_s`, `downscale_delay_s`, `downscale_to_zero_delay_s`
+- `upscaling_factor`, `downscaling_factor`
+- `min_replicas`, `max_replicas`
+
+The following example shows a custom autoscaling policy with standard autoscaling parameters applied.
+
+```{literalinclude} ../doc_code/autoscaling_policy.py
+:language: python
+:start-after: __begin_apply_autoscaling_config_example__
+:end-before: __end_apply_autoscaling_config_example__
+```
+
+```{literalinclude} ../doc_code/autoscaling_policy.py
+:language: python
+:start-after: __begin_apply_autoscaling_config_usage__
+:end-before: __end_apply_autoscaling_config_usage__
+```
+
+::::{note}
+Your policy function should return the "raw" desired number of replicas. Ray Serve applies the `autoscaling_config` settings (delays, factors, and bounds) on top of your decision.
+
+Your policy can return the "raw" desired replica count as an `int` or a `float`. Ray Serve always produces an integer replica count as its final decision.
+::::
+
+
 ### Custom metrics
 
 You can make richer decisions by emitting your own metrics from the deployment. Implement `record_autoscaling_stats()` to return a `dict[str, float]`. Ray Serve will surface these values in the [`AutoscalingContext`](../api/doc/ray.serve.config.AutoscalingContext.rst).
@@ -681,9 +711,10 @@ By default, each deployment in Ray Serve autoscales independently. When you have
 
 An application-level autoscaling policy is a function that takes a `dict[DeploymentID, AutoscalingContext]` objects (one per deployment) and returns a tuple of `(decisions, policy_state)`. Each context contains metrics and bounds for one deployment, and the policy returns target replica counts for all deployments.
 
-The `policy_state` returned from an application-level policy must be a `dict[DeploymentID, dict]`— a dictionary mapping each deployment ID to its own state dictionary. Serve stores this per-deployment state and on the next control-loop iteration, injects each deployment's state back into that deployment's `AutoscalingContext.policy_state`. 
+The `policy_state` returned from an application-level policy must be a `Dict[DeploymentID, Dict]`— a dictionary mapping each deployment ID to its own state dictionary. Serve stores this per-deployment state and on the next control-loop iteration, injects each deployment's state back into that deployment's `AutoscalingContext.policy_state`. 
+The per-deployment number of replicas returned from the policy can be an `int` or a `float`. If the policy returns a float, Ray Serve converts it to an integer replica count by rounding up to the nearest integer.
 
-Serve itself does not interpret the contents of `policy_state`. All the keys in each deployment's state dictionary are user-controlled.
+Serve itself does not interpret the contents of `policy_state`. All the keys in each deployment's state dictionary are user-controlled, except for internal keys that Ray Serve uses when it applies default parameters to custom autoscaling policies.
 The following example shows a policy that scales deployments based on their relative load, ensuring that downstream deployments have enough capacity for upstream traffic:
 
 `autoscaling_policy.py` file:
@@ -728,6 +759,19 @@ Programmatic configuration of application-level autoscaling policies through `se
 When you specify both a deployment-level policy and an application-level policy, the application-level policy takes precedence. Ray Serve logs a warning if you configure both.
 :::
 
+
+#### Applying standard autoscaling parameters to application-level policies
+Ray Serve automatically applies standard autoscaling parameters (delays, factors, and min/max bounds) to application-level policies on a per-deployment basis.
+These parameters include:
+- `upscale_delay_s`, `downscale_delay_s`, `downscale_to_zero_delay_s`
+- `upscaling_factor`, `downscaling_factor`
+- `min_replicas`, `max_replicas`
+
+The following YAML configuration file shows these parameters applied to an application-level policy.
+```{literalinclude} ../doc_code/application_level_autoscaling_with_defaults.yaml
+:language: yaml
+```
+Your application-level policy can return the per-deployment desired replica counts as `int` or `float` values. Ray Serve applies the autoscaling config parameters per deployment and returns integer decisions.
 :::{warning}
 ### Gotchas and limitations
 
 
@@ -86,6 +86,7 @@ See the [model composition guide](serve-model-composition) for how to update cod
    serve.config.AutoscalingConfig
    serve.config.AutoscalingPolicy
    serve.config.AutoscalingContext
+   serve.autoscaling_policy.replica_queue_length_autoscaling_policy
    serve.config.AggregationFunction
    serve.config.RequestRouterConfig
 ```
 
@@ -0,0 +1,20 @@
+applications:
+  - name: MyApp
+    import_path: application_level_autoscaling:app
+    autoscaling_policy:
+      policy_function: autoscaling_policy:coordinated_scaling_policy_with_defaults
+    deployments:
+      - name: Preprocessor
+        autoscaling_config:
+          min_replicas: 1
+          max_replicas: 10
+          target_ongoing_requests: 1
+          upscale_delay_s: 2
+          downscale_delay_s: 5
+      - name: Model
+        autoscaling_config:
+          min_replicas: 2
+          max_replicas: 20
+          target_ongoing_requests: 1
+          upscale_delay_s: 3
+          downscale_delay_s: 5
@@ -131,3 +131,48 @@ def stateful_application_level_policy(
 
 
 # __end_stateful_application_level_policy__
+# __begin_apply_autoscaling_config_example__
+from typing import Any, Dict
+from ray.serve.config import AutoscalingContext
+
+
+def queue_length_based_autoscaling_policy(
+    ctx: AutoscalingContext,
+) -> tuple[int, Dict[str, Any]]:
+    # This policy calculates the "raw" desired replicas based on queue length.
+    # Ray Serve automatically applies scaling factors, delays, and bounds from
+    # the deployment's autoscaling_config on top of this decision.
+
+    queue_length = ctx.total_num_requests
+
+    if queue_length > 50:
+        return 10, {}
+    elif queue_length > 10:
+        return 5, {}
+    else:
+        return 0, {}
+# __end_apply_autoscaling_config_example__
+
+# __begin_apply_autoscaling_config_usage__
+from ray import serve
+from ray.serve.config import AutoscalingConfig, AutoscalingPolicy
+
+@serve.deployment(
+    autoscaling_config=AutoscalingConfig(
+        min_replicas=1,
+        max_replicas=10,
+        metrics_interval_s=0.1,
+        upscale_delay_s=1.0,
+        downscale_delay_s=1.0,
+        policy=AutoscalingPolicy(
+            policy_function=queue_length_based_autoscaling_policy
+        )
+    ),
+    max_ongoing_requests=5,
+)
+class MyDeployment:
+    def __call__(self) -> str:
+        return "Hello, world!"
+
+app = MyDeployment.bind()
+# __end_apply_autoscaling_config_usage__
@@ -1,7 +1,8 @@
 import logging
+import math
 import time
 from collections import defaultdict
-from typing import Any, Callable, Dict, List, Optional, Set, Tuple
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
 
 from ray.serve._private.common import (
     RUNNING_REQUESTS_KEY,
@@ -28,6 +29,10 @@
 )
 from ray.serve._private.usage import ServeUsageTag
 from ray.serve._private.utils import get_capacity_adjusted_num_replicas
+from ray.serve.autoscaling_policy import (
+    _apply_app_level_autoscaling_config,
+    _apply_autoscaling_config,
+)
 from ray.serve.config import AutoscalingContext, AutoscalingPolicy
 from ray.util import metrics
 
@@ -52,7 +57,9 @@ def __init__(self, deployment_id: DeploymentID):
         self._deployment_info = None
         self._config = None
         self._policy: Optional[
-            Callable[[AutoscalingContext], Tuple[int, Optional[Dict[str, Any]]]]
+            Callable[
+                [AutoscalingContext], Tuple[Union[int, float], Optional[Dict[str, Any]]]
+            ]
         ] = None
         # user defined policy returns a dictionary of state that is persisted between autoscaling decisions
         # content of the dictionary is determined by the user defined policy
@@ -113,7 +120,8 @@ def register(self, info: DeploymentInfo, curr_target_num_replicas: int) -> int:
 
         self._deployment_info = info
         self._config = config
-        self._policy = self._config.policy.get_policy()
+        # Apply default autoscaling config to the policy
+        self._policy = _apply_autoscaling_config(self._config.policy.get_policy())
         self._target_capacity = info.target_capacity
         self._target_capacity_direction = info.target_capacity_direction
         self._policy_state = {}
@@ -305,6 +313,9 @@ def get_decision_num_replicas(
         # Time the policy execution
         start_time = time.time()
         decision_num_replicas, self._policy_state = self._policy(autoscaling_context)
+        # The policy can return a float value.
+        if isinstance(decision_num_replicas, float):
+            decision_num_replicas = math.ceil(decision_num_replicas)
         policy_execution_time_ms = (time.time() - start_time) * 1000
 
         self.record_autoscaling_metrics(
@@ -815,7 +826,10 @@ def __init__(
         self._policy: Optional[
             Callable[
                 [Dict[DeploymentID, AutoscalingContext]],
-                Tuple[Dict[DeploymentID, int], Optional[Dict[DeploymentID, Dict]]],
+                Tuple[
+                    Dict[DeploymentID, Union[int, float]],
+                    Optional[Dict[DeploymentID, Dict]],
+                ],
             ]
         ] = None
         # user defined policy returns a dictionary of state that is persisted between autoscaling decisions
@@ -837,7 +851,10 @@ def register(
         Args:
             autoscaling_policy: The autoscaling policy to register.
         """
-        self._policy = autoscaling_policy.get_policy()
+        # Apply default autoscaling config to the policy
+        self._policy = _apply_app_level_autoscaling_config(
+            autoscaling_policy.get_policy()
+        )
         self._policy_state = {}
 
         # Log when custom autoscaling policy is used for application
@@ -974,10 +991,10 @@ def get_decision_num_replicas(
                 )
                 results[deployment_id] = (
                     self._deployment_autoscaling_states[deployment_id].apply_bounds(
-                        num_replicas
+                        math.ceil(num_replicas)
                     )
                     if not _skip_bound_check
-                    else num_replicas
+                    else math.ceil(num_replicas)
                 )
             return results
         else: