Skip to content

Commit da9905e

Browse files
nrghosh and peterxcli
authored and committed
[docs][data][llm] Batch inference docs reorg + update to reflect per-stage config refactor (ray-project#59214)
Signed-off-by: Nikhil Ghosh <nikhil@anyscale.com>
Signed-off-by: Nikhil G <nrghosh@users.noreply.github.com>
Signed-off-by: peterxcli <peterxcli@gmail.com>
1 parent 462efbc commit da9905e

File tree

7 files changed

+262
-360
lines changed

7 files changed

+262
-360
lines changed

.vale/styles/config/vocabularies/Data/accept.txt

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,20 +24,25 @@ ndarray(s)?
2424
NLP
2525
[Oo]utqueue(s)?
2626
PDFs
27+
PIL
2728
[Pp]ipelined
2829
Predibase('s)?
2930
[Pp]refetch
3031
[Pp]refetching
32+
[Pp]ostprocess
3133
[Pp]reprocess
3234
[Pp]reprocessor(s)?
3335
process_file
3436
[Pp]ushdown
3537
queryable
3638
RGB
37-
runai
39+
[Rr]un[Aa][Ii]
3840
[Ss]calers
41+
SGLang
3942
Spotify('s)?
4043
TFRecord(s)?
44+
TPU(s)?
45+
[Tt]okenizer(s)?
4146
UDF(s)?
4247
VLM(s)?
4348
XGBoost

doc/source/data/api/llm.rst

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,3 +35,18 @@ Processor configs
3535
~HttpRequestProcessorConfig
3636
~vLLMEngineProcessorConfig
3737
~SGLangEngineProcessorConfig
38+
39+
.. _stage-configs-ref:
40+
41+
Stage configs
42+
-------------
43+
44+
.. autosummary::
45+
:nosignatures:
46+
:template: autosummary/class_without_autosummary_noinheritance.rst
47+
:toctree: doc/
48+
49+
~ChatTemplateStageConfig
50+
~TokenizerStageConfig
51+
~DetokenizeStageConfig
52+
~PrepareMultimodalStageConfig

doc/source/data/doc_code/working-with-llms/basic_llm_example.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -179,8 +179,8 @@
179179
),
180180
batch_size=32,
181181
concurrency=1,
182-
apply_chat_template=False,
183-
detokenize=False,
182+
chat_template_stage=False, # Skip chat templating for embeddings
183+
detokenize_stage=False, # Skip detokenization for embeddings
184184
)
185185

186186
# Example usage for embeddings
Lines changed: 46 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1,69 +1,69 @@
11
"""
2-
Documentation example and test for classification model batch inference.
2+
Classification batch inference with Ray Data LLM.
33
4-
This example demonstrates how to use Ray Data LLM with sequence classification
5-
models like educational content classifiers.
4+
Uses sequence classification models for content classifiers and sentiment analyzers.
65
"""
76

7+
# Dependency setup
88
import subprocess
99
import sys
1010

1111
subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", "ray[llm]"])
12+
subprocess.check_call(
13+
[sys.executable, "-m", "pip", "install", "--upgrade", "transformers"]
14+
)
1215
subprocess.check_call([sys.executable, "-m", "pip", "install", "numpy==1.26.4"])
1316

1417

15-
def run_classification_example():
16-
# __classification_example_start__
17-
import ray
18-
from ray.data.llm import vLLMEngineProcessorConfig, build_processor
18+
# __classification_example_start__
19+
import ray
20+
from ray.data.llm import vLLMEngineProcessorConfig, build_processor
1921

20-
# Configure vLLM for a sequence classification model
21-
classification_config = vLLMEngineProcessorConfig(
22-
model_source="nvidia/nemocurator-fineweb-nemotron-4-edu-classifier",
23-
task_type="classify", # Use 'classify' for sequence classification models
24-
engine_kwargs=dict(
25-
max_model_len=512,
26-
enforce_eager=True,
27-
),
28-
batch_size=8,
29-
concurrency=1,
30-
apply_chat_template=False,
31-
detokenize=False,
32-
)
22+
# Configure vLLM for a sequence classification model
23+
classification_config = vLLMEngineProcessorConfig(
24+
model_source="nvidia/nemocurator-fineweb-nemotron-4-edu-classifier",
25+
task_type="classify", # Use 'classify' for sequence classification models
26+
engine_kwargs=dict(
27+
max_model_len=512,
28+
enforce_eager=True,
29+
),
30+
batch_size=8,
31+
concurrency=1,
32+
apply_chat_template=False,
33+
detokenize=False,
34+
)
3335

34-
classification_processor = build_processor(
35-
classification_config,
36-
preprocess=lambda row: dict(prompt=row["text"]),
37-
postprocess=lambda row: {
38-
"text": row["prompt"],
39-
# Classification models return logits in the 'embeddings' field
40-
"edu_score": float(row["embeddings"][0])
41-
if row.get("embeddings") is not None and len(row["embeddings"]) > 0
42-
else None,
43-
},
44-
)
45-
46-
# Sample texts with varying educational quality
47-
texts = [
48-
"lol that was so funny haha",
49-
"Photosynthesis converts light energy into chemical energy.",
50-
"Newton's laws describe the relationship between forces and motion.",
51-
]
52-
ds = ray.data.from_items([{"text": text} for text in texts])
53-
54-
classified_ds = classification_processor(ds)
55-
classified_ds.show(limit=3)
56-
# __classification_example_end__
36+
classification_processor = build_processor(
37+
classification_config,
38+
preprocess=lambda row: dict(prompt=row["text"]),
39+
postprocess=lambda row: {
40+
"text": row["prompt"],
41+
# Classification models return logits in the 'embeddings' field
42+
"edu_score": float(row["embeddings"][0])
43+
if row.get("embeddings") is not None and len(row["embeddings"]) > 0
44+
else None,
45+
},
46+
)
5747

48+
# Sample texts with varying educational quality
49+
texts = [
50+
"lol that was so funny haha",
51+
"Photosynthesis converts light energy into chemical energy.",
52+
"Newton's laws describe the relationship between forces and motion.",
53+
]
54+
ds = ray.data.from_items([{"text": text} for text in texts])
5855

5956
if __name__ == "__main__":
6057
try:
6158
import torch
6259

6360
if torch.cuda.is_available():
64-
run_classification_example()
61+
classified_ds = classification_processor(ds)
62+
classified_ds.show(limit=3)
6563
else:
66-
print("Skipping classification example (no GPU available)")
64+
print("Skipping classification run (no GPU available)")
6765
except Exception as e:
68-
print(f"Skipping classification example: {e}")
66+
print(f"Skipping classification run due to environment error: {e}")
67+
# __classification_example_end__
68+
6969

doc/source/data/doc_code/working-with-llms/embedding_example.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,8 @@ def run_embedding_example():
2626
),
2727
batch_size=32,
2828
concurrency=1,
29-
apply_chat_template=False,
30-
detokenize=False,
29+
chat_template_stage=False, # Skip chat templating for embeddings
30+
detokenize_stage=False, # Skip detokenization for embeddings
3131
)
3232

3333
embedding_processor = build_processor(

doc/source/data/doc_code/working-with-llms/vlm_image_example.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -62,10 +62,8 @@
6262
trust_remote_code=True,
6363
limit_mm_per_prompt={"image": 1},
6464
),
65-
# Override Ray's runtime env to include the Hugging Face token. Ray Data uses Ray under the hood to orchestrate the inference pipeline.
6665
runtime_env=dict(
6766
env_vars=dict(
68-
# HF_TOKEN=HF_TOKEN, # Token not needed for public models
6967
VLLM_USE_V1="1",
7068
),
7169
),
@@ -186,9 +184,6 @@ def create_vlm_config():
186184
trust_remote_code=True,
187185
limit_mm_per_prompt={"image": 1},
188186
),
189-
runtime_env={
190-
# "env_vars": {"HF_TOKEN": "your-hf-token-here"} # Token not needed for public models
191-
},
192187
batch_size=1,
193188
accelerator_type="L4",
194189
concurrency=1,

0 commit comments

Comments (0)