ray-project
diff --git a/python/ray/data/llm.py b/python/ray/data/llm.py
Lines changed: 15 additions & 13 deletions
diff --git a/python/ray/llm/_internal/batch/processor/serve_deployment_proc.py b/python/ray/llm/_internal/batch/processor/serve_deployment_proc.py
Lines changed: 6 additions & 28 deletions
diff --git a/python/ray/llm/_internal/batch/stages/serve_deployment_stage.py b/python/ray/llm/_internal/batch/stages/serve_deployment_stage.py
Lines changed: 42 additions & 35 deletions
diff --git a/python/ray/llm/_internal/batch/stages/sglang_engine_stage.py b/python/ray/llm/_internal/batch/stages/sglang_engine_stage.py
Lines changed: 8 additions & 9 deletions
@@ -249,14 +249,12 @@ class SGLangEngineProcessorConfig(_SGLangEngineProcessorConfig):
 class ServeDeploymentProcessorConfig(_ServeDeploymentProcessorConfig):
     """The configuration for the serve deployment processor.
 
-    This processor enables sharing LLM engines across multiple processors and is compatible with
-    both vLLM and SGLang engines, depending on the underlying serve deployment.
+    This processor sends requests to a named serve deployment, so that multiple processors
+    can share it, for example to reuse the same LLM engine.
 
     Args:
-        llm_config: The LLM config used to build the serve deployment.
-        name_prefix: The name prefix of the serve application to use.
+        deployment_name: The name of the serve deployment to use.
         app_name: The name of the serve application to use.
-        method: The method to invoke on the serve deployment.
         batch_size: The batch size to send to the serve deployment. Large batch sizes are
             likely to saturate the compute resources and could achieve higher throughput.
             On the other hand, small batch sizes are more fault-tolerant and could
@@ -296,24 +294,28 @@ class ServeDeploymentProcessorConfig(_ServeDeploymentProcessorConfig):
                 ),
             )
 
-            APP_NAME = "facebook"
+            APP_NAME = "facebook_opt_app"
+            DEPLOYMENT_NAME = "facebook_deployment"
 
-            llm_app = build_llm_deployment(llm_config, name_prefix="chat_completions")
+            llm_app = build_llm_deployment(llm_config, deployment_name=DEPLOYMENT_NAME)
             app = serve.run(llm_app, name=APP_NAME)
 
             config=ServeDeploymentProcessorConfig(
-                llm_config=llm_config,
+                deployment_name=DEPLOYMENT_NAME,
                 app_name=APP_NAME,
-                method="completions",
                 batch_size=1,
                 concurrency=1,
             )
             processor = build_llm_processor(
                 config,
-                preprocess=lambda row: CompletionRequest(
-                    model="facebook/opt-1.3b",
-                    prompt=row["prompt"],
-                    stream=False
+                preprocess=lambda row: dict(
+                    method="completions",
+                    dtype="CompletionRequest",
+                    request_kwargs=dict(
+                        model="facebook/opt-1.3b",
+                        prompt=row["prompt"],
+                        stream=False
+                    )
                 ),
                 postprocess=lambda row: dict(
                     resp=row["choices"][0]["text"],
 
@@ -1,15 +1,10 @@
 """The processor that runs serve deployment."""
 
-from typing import Optional
+from typing import Optional, Type, Any, Tuple
 from pydantic import Field
 
 import ray
 from ray.data.block import UserDefinedFunction
-from ray.llm._internal.batch.observability.usage_telemetry.usage import (
-    BatchModelTelemetry,
-    TelemetryAgent,
-    get_or_create_telemetry_agent,
-)
 from ray.llm._internal.batch.processor.base import (
     ProcessorConfig,
     Processor,
@@ -18,30 +13,24 @@
 from ray.llm._internal.batch.stages import (
     ServeDeploymentStage,
 )
-from ray.serve.llm import LLMConfig
 
 
 class ServeDeploymentProcessorConfig(ProcessorConfig):
     """The configuration for the serve deployment processor."""
 
-    # Configurations that was used to build the serve deployment
-    llm_config: LLMConfig = Field(
-        description="The LLM config to use for the serve deployment.",
+    # Names that identify the serve deployment this processor sends requests to
+    deployment_name: str = Field(
+        description="The name of the serve deployment to use.",
     )
     app_name: str = Field(
         description="The name of the serve application to use.",
     )
-    # Method to invoke on the serve deployment
-    method: str = Field(
-        description="The method to use for the serve deployment.",
-    )
 
 
 def build_serve_deployment_processor(
     config: ServeDeploymentProcessorConfig,
     preprocess: Optional[UserDefinedFunction] = None,
     postprocess: Optional[UserDefinedFunction] = None,
-    telemetry_agent: Optional[TelemetryAgent] = None,
 ) -> Processor:
     """
     Construct a processor that runs a serve deployment.
@@ -60,26 +49,15 @@ def build_serve_deployment_processor(
     stages = [
         ServeDeploymentStage(
             fn_constructor_kwargs=dict(
-                deployment_name=config.llm_config.deployment_name,
+                deployment_name=config.deployment_name,
                 app_name=config.app_name,
-                method=config.method,
             ),
             map_batches_kwargs=dict(
                 concurrency=config.concurrency,
             ),
         )
     ]
-    telemetry_agent = get_or_create_telemetry_agent()
-    telemetry_agent.push_telemetry_report(
-        BatchModelTelemetry(
-            processor_config_name=type(config).__name__,
-            model_architecture=config.llm_config.model_architecture,
-            batch_size=config.batch_size,
-            accelerator_type=config.llm_config.accelerator_type,
-            concurrency=config.concurrency,
-            task_type=config.method,
-        )
-    )
+    # TODO (Kourosh): Add telemetry for ServeDeploymentStage
     processor = Processor(
         config,
         stages,
 
@@ -3,17 +3,21 @@
 import logging
 import time
 import uuid
+from typing import Any, AsyncIterator, Dict, List, Type, Union, Tuple, Optional
+import asyncio
+from pydantic import BaseModel
 
 from ray.llm._internal.batch.stages.base import (
     StatefulStage,
     StatefulStageUDF,
 )
 from ray import serve
-from typing import Any, AsyncIterator, Dict, List, Type, Union, Tuple
-import asyncio
+
+# Keep these imports: a row's "dtype" string is resolved to one of these classes via globals().
 from ray.llm._internal.serve.configs.openai_api_models import (
     CompletionRequest,
     ChatCompletionRequest,
+    EmbeddingRequest,
 )
 
 logger = logging.getLogger(__name__)
@@ -26,7 +30,6 @@ def __init__(
         expected_input_keys: List[str],
         deployment_name: str,
         app_name: str,
-        method: str,
     ):
         """
         Initialize the ServeDeploymentStageUDF.
@@ -36,52 +39,39 @@ def __init__(
             expected_input_keys: The expected input keys of the stage.
             deployment_name: The name of the deployment.
             app_name: The name of the deployment app.
-            method: The method to call on the deployment.
         """
         super().__init__(data_column, expected_input_keys)
         self._dh = serve.get_deployment_handle(deployment_name, app_name).options(
             stream=True
         )
-        self._method = method
-        self._request_type = self._resolve_request_type()
         self.request_id = 0
 
-    def _prepare_request(self, row: Dict[str, Any]) -> Dict[str, Any]:
+    def _prepare_request(
+        self, row: Dict[str, Any]
+    ) -> Tuple[Dict[str, Any], Optional[Type[Any]], str]:
         """
         Decorate the request with metadata related to the batch.
 
         Args:
             row: The row.
 
         Returns:
-            The decorated request.
+            A tuple of (decorated_request, dtype, method_name). dtype is the class of the request object and
+            can be None if the serve deployment accepts a raw dict. method_name is the name of the method to
+            invoke on the serve deployment.
         """
+        method = row.get("method")
+        dtype = globals()[row.get("dtype")] if row.get("dtype") else None
+
+        request_kwargs = row.pop("request_kwargs")
         request = {
             "request_id": str(self.request_id),
             "idx_in_batch": row[self.IDX_IN_BATCH_COLUMN],
-            **row,
+            **request_kwargs,
         }
         self.request_id += 1
-        return request
 
-    def _resolve_request_type(self) -> Union[ChatCompletionRequest, CompletionRequest]:
-        """
-        Resolve the request type based on the method.
-
-        Returns:
-            The request type.
-        """
-        if getattr(self._dh, self._method) is None:
-            raise ValueError(
-                f"Method {self._method} is not supported by the serve deployment."
-            )
-
-        if self._method == "chat":
-            return ChatCompletionRequest
-        elif self._method == "completions":
-            return CompletionRequest
-        else:
-            raise ValueError(f"Unsupported method: {self._method}")
+        return request, dtype, method
 
     async def generate_async(
         self, row: Dict[str, Any]
@@ -95,14 +85,25 @@ async def generate_async(
         Returns:
             The response from the serve deployment.
         """
-        request = self._prepare_request(row)
-        t = time.perf_counter()
+        request, dtype, method = self._prepare_request(row)
 
-        output_data = await anext(
-            getattr(self._dh, self._method).remote(self._request_type(**request))
-        )
+        if dtype is not None:
+            request_obj = dtype(**request)
+        else:
+            request_obj = request
+
+        if getattr(self._dh, method) is None:
+            raise ValueError(f"Method {method} not found in the serve deployment.")
+
+        t = time.perf_counter()
+        output_data = await anext(getattr(self._dh, method).remote(request_obj))
         time_taken = time.perf_counter() - t
-        return request, output_data.model_dump(), time_taken
+
+        # Convert the output data to a dict if it is a Pydantic model.
+        if isinstance(output_data, BaseModel):
+            output_data = output_data.model_dump()
+
+        return request, output_data, time_taken
 
     async def udf(self, batch: List[Dict[str, Any]]) -> AsyncIterator[Dict[str, Any]]:
         """
@@ -122,7 +123,7 @@ async def udf(self, batch: List[Dict[str, Any]]) -> AsyncIterator[Dict[str, Any]
                 "request_id": request["request_id"],
                 self.IDX_IN_BATCH_COLUMN: request["idx_in_batch"],
                 "batch_uuid": batch_uuid.hex,
-                "time_taken_llm": time_taken,
+                "time_taken": time_taken,
                 **output,
             }
 
@@ -137,3 +138,9 @@ async def udf(self, batch: List[Dict[str, Any]]) -> AsyncIterator[Dict[str, Any]
 
 class ServeDeploymentStage(StatefulStage):
     fn: Type[StatefulStageUDF] = ServeDeploymentStageUDF
+
+    def get_required_input_keys(self) -> Dict[str, str]:
+        return {
+            "method": "Name of the method to invoke on the serve deployment.",
+            "request_kwargs": "The request_kwargs to construct the request to the serve deployment.",
+        }
@@ -177,24 +177,22 @@ async def _prepare_llm_request(self, row: Dict[str, Any]) -> SGLangEngineRequest
 
     async def generate_async(
         self, row: Dict[str, Any]
-    ) -> Tuple[SGLangEngineRequest, Dict[str, Any], float]:
+    ) -> Tuple[SGLangEngineRequest, Dict[str, Any]]:
         """Process a single request.
 
         Args:
             request: The request.
 
         Returns:
-            A tuple of index in batch, request output, bypassed custom fields, and time taken.
+            A tuple of the prepared request and the request output as a dict.
         """
         request = await self._prepare_llm_request(row)
-        t = time.perf_counter()
 
         async with self.semaphore:
             output = await self._generate_async(request)
 
-        time_taken = time.perf_counter() - t
         output_data = SGLangOutputData.from_sglang_engine_output(output)
-        return request, output_data.model_dump(), time_taken
+        return request, output_data.model_dump()
 
     async def _generate_async(self, request: SGLangEngineRequest) -> Any:
         """Process a single request.
@@ -327,24 +325,25 @@ async def udf(self, batch: List[Dict[str, Any]]) -> AsyncIterator[Dict[str, Any]
 
         tasks = [asyncio.create_task(self.llm.generate_async(row)) for row in batch]
 
+        time_taken = -1.0
         for resp in asyncio.as_completed(tasks):
-            request, output, time_taken_llm = await resp
+            request, output = await resp
+            time_taken = time.perf_counter() - t
 
             yield {
                 **output,
                 "request_id": request.request_id,
                 self.IDX_IN_BATCH_COLUMN: request.idx_in_batch,
                 "batch_uuid": batch_uuid.hex,
-                "time_taken_llm": time_taken_llm,
+                "time_taken_llm": time_taken,
                 "params": str(request.params),
             }
 
-        batch_time_taken = time.perf_counter() - t
         logger.info(
             "[SGLang] Elapsed time for batch %s with size %d: %s",
             batch_uuid.hex,
             len(batch),
-            batch_time_taken,
+            time_taken,
         )
 
     def __del__(self):