Skip to content

Commit 6517392

Browse files
kouroshHakhagemini-code-assist[bot]
authored andcommitted
[Data][LLM] Add support for classification and scoring models in Ray Data LLM (ray-project#59499)
Signed-off-by: Kourosh Hakhamaneshi <kourosh@anyscale.com> Signed-off-by: kourosh hakhamaneshi <31483498+kouroshHakha@users.noreply.github.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Signed-off-by: peterxcli <peterxcli@gmail.com>
1 parent 84cee1f commit 6517392

File tree

4 files changed

+160
-4
lines changed

4 files changed

+160
-4
lines changed

doc/source/data/doc_code/working-with-llms/basic_llm_example.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,41 @@ def create_embedding_processor():
197197

198198
# __embedding_config_example_end__
199199

200+
# __classification_config_example_start__
# Sequence classification model configuration.
# Use task_type="classify" for classification models (e.g., sentiment, quality scoring)
# Use task_type="score" for cross-encoder scoring models
classification_config = vLLMEngineProcessorConfig(
    model_source="nvidia/nemocurator-fineweb-nemotron-4-edu-classifier",
    task_type="classify",
    engine_kwargs={
        # Classifier inputs are short; a small context window keeps memory low.
        "max_model_len": 512,
        "enforce_eager": True,
    },
    batch_size=8,
    concurrency=1,
    apply_chat_template=False,
    detokenize=False,
)
216+
217+
218+
# Example usage for classification
def create_classification_processor():
    """Build a Ray Data LLM processor wired to ``classification_config``."""

    def _preprocess(row):
        # The classifier takes the raw text directly as the prompt.
        return dict(prompt=row["text"])

    def _postprocess(row):
        # Classification models return logits in the 'embeddings' field
        logits = row.get("embeddings")
        score = float(logits[0]) if logits is not None and len(logits) > 0 else None
        return {
            "text": row["prompt"],
            "score": score,
        }

    return build_processor(
        classification_config,
        preprocess=_preprocess,
        postprocess=_postprocess,
    )


# __classification_config_example_end__
234+
200235
# __shared_vllm_engine_config_example_start__
201236
import ray
202237
from ray import serve
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
"""
Documentation example and test for classification model batch inference.

This example demonstrates how to use Ray Data LLM with sequence classification
models like educational content classifiers.
"""

import subprocess
import sys

# Install the Ray LLM extras and pin numpy before the example imports them.
for _pip_args in (["--upgrade", "ray[llm]"], ["numpy==1.26.4"]):
    subprocess.check_call([sys.executable, "-m", "pip", "install", *_pip_args])
13+
14+
15+
def run_classification_example():
    """Run batch inference with a sequence classification model on Ray Data."""
    # __classification_example_start__
    import ray
    from ray.data.llm import vLLMEngineProcessorConfig, build_processor

    # Configure vLLM for a sequence classification model
    config = vLLMEngineProcessorConfig(
        model_source="nvidia/nemocurator-fineweb-nemotron-4-edu-classifier",
        task_type="classify",  # Use 'classify' for sequence classification models
        engine_kwargs=dict(
            max_model_len=512,
            enforce_eager=True,
        ),
        batch_size=8,
        concurrency=1,
        apply_chat_template=False,
        detokenize=False,
    )

    def _to_prompt(row):
        # Pass the raw text through as the model prompt.
        return dict(prompt=row["text"])

    def _to_record(row):
        # Classification models return logits in the 'embeddings' field
        logits = row.get("embeddings")
        score = float(logits[0]) if logits is not None and len(logits) > 0 else None
        return {
            "text": row["prompt"],
            "edu_score": score,
        }

    processor = build_processor(
        config,
        preprocess=_to_prompt,
        postprocess=_to_record,
    )

    # Sample texts with varying educational quality
    samples = [
        "lol that was so funny haha",
        "Photosynthesis converts light energy into chemical energy.",
        "Newton's laws describe the relationship between forces and motion.",
    ]
    dataset = ray.data.from_items([{"text": sample} for sample in samples])

    classified = processor(dataset)
    classified.show(limit=3)
    # __classification_example_end__
57+
58+
59+
if __name__ == "__main__":
    # Best-effort entry point: the example needs torch with a CUDA device;
    # on any failure (missing deps, engine errors) we skip rather than fail CI.
    try:
        import torch

        if not torch.cuda.is_available():
            print("Skipping classification example (no GPU available)")
        else:
            run_classification_example()
    except Exception as e:
        print(f"Skipping classification example: {e}")
69+

doc/source/data/working-with-llms.rst

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ This guide shows you how to use :ref:`ray.data.llm <llm-ref>` to:
1111
* :ref:`Perform batch inference with LLMs <batch_inference_llm>`
1212
* :ref:`Configure vLLM for LLM inference <vllm_llm>`
1313
* :ref:`Batch inference with embedding models <embedding_models>`
14+
* :ref:`Batch inference with classification models <classification_models>`
1415
* :ref:`Query deployed models with an OpenAI compatible API endpoint <openai_compatible_api_endpoint>`
1516

1617
.. _vllm_quickstart:
@@ -221,6 +222,39 @@ For a complete embedding configuration example, see:
221222
:start-after: __embedding_config_example_start__
222223
:end-before: __embedding_config_example_end__
223224

225+
.. _classification_models:

Batch inference with classification models
------------------------------------------

Ray Data LLM supports batch inference with sequence classification models, such as content classifiers and sentiment analyzers:

.. literalinclude:: doc_code/working-with-llms/classification_example.py
    :language: python
    :start-after: __classification_example_start__
    :end-before: __classification_example_end__

.. testoutput::
    :options: +MOCK

    {'text': 'lol that was so funny haha', 'edu_score': -0.05}
    {'text': 'Photosynthesis converts light energy...', 'edu_score': 1.73}
    {'text': "Newton's laws describe...", 'edu_score': 2.52}

Key differences for classification models:

- Set ``task_type="classify"`` (or ``task_type="score"`` for scoring models)
- Set ``apply_chat_template=False`` and ``detokenize=False``
- Use direct ``prompt`` input instead of ``messages``
- Access classification logits through ``row["embeddings"]``

For a complete classification configuration example, see:

.. literalinclude:: doc_code/working-with-llms/basic_llm_example.py
    :language: python
    :start-after: __classification_config_example_start__
    :end-before: __classification_config_example_end__
257+
224258
.. _openai_compatible_api_endpoint:
225259

226260
Batch inference with an OpenAI-compatible endpoint

python/ray/llm/_internal/batch/stages/vllm_engine_stage.py

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,12 @@ class vLLMTaskType(str, Enum):
6363
"""Generate embeddings."""
6464
EMBED = "embed"
6565

66+
"""Classification (e.g., sequence classification models)."""
67+
CLASSIFY = "classify"
68+
69+
"""Scoring (e.g., cross-encoder models)."""
70+
SCORE = "score"
71+
6672

6773
class vLLMEngineRequest(BaseModel):
6874
"""A request to the vLLM engine."""
@@ -255,7 +261,11 @@ def __init__(
255261
) from e
256262

257263
# Construct PoolerConfig if override_pooler_config is specified.
258-
if self.task_type == vLLMTaskType.EMBED and "override_pooler_config" in kwargs:
264+
if (
265+
self.task_type
266+
in {vLLMTaskType.EMBED, vLLMTaskType.CLASSIFY, vLLMTaskType.SCORE}
267+
and "override_pooler_config" in kwargs
268+
):
259269
kwargs["override_pooler_config"] = vllm.config.PoolerConfig(
260270
**kwargs["override_pooler_config"]
261271
)
@@ -375,7 +385,11 @@ async def _prepare_llm_request(self, row: Dict[str, Any]) -> vLLMEngineRequest:
375385
**maybe_convert_ndarray_to_list(sampling_params),
376386
structured_outputs=structured_outputs,
377387
)
378-
elif self.task_type == vLLMTaskType.EMBED:
388+
elif self.task_type in (
389+
vLLMTaskType.EMBED,
390+
vLLMTaskType.CLASSIFY,
391+
vLLMTaskType.SCORE,
392+
):
379393
params = vllm.PoolingParams(task=self.task_type.value)
380394
else:
381395
raise ValueError(f"Unsupported task type: {self.task_type}")
@@ -456,8 +470,12 @@ async def _generate_async(self, request: vLLMEngineRequest) -> Any:
456470
)
457471

458472
# Send the request to the LLM engine.
459-
# vLLM 0.12.0 uses encode() for pooling/embedding tasks, generate() for text generation
460-
if self.task_type == vLLMTaskType.EMBED:
473+
# vLLM 0.12.0 uses encode() for pooling/embedding/classification tasks, generate() for text generation
474+
if self.task_type in (
475+
vLLMTaskType.EMBED,
476+
vLLMTaskType.CLASSIFY,
477+
vLLMTaskType.SCORE,
478+
):
461479
stream = self.engine.encode(
462480
request_id=str(request.request_id),
463481
prompt=llm_prompt,

0 commit comments

Comments
 (0)