Skip to content

Commit da9905e

Browse files
nrghosh and peterxcli
authored and committed
[docs][data][llm] Batch inference docs reorg + update to reflect per-stage config refactor (ray-project#59214)
Signed-off-by: Nikhil Ghosh <nikhil@anyscale.com>
Signed-off-by: Nikhil G <nrghosh@users.noreply.github.com>
Signed-off-by: peterxcli <peterxcli@gmail.com>
1 parent 462efbc commit da9905e

File tree

7 files changed

+262
-360
lines changed

7 files changed

+262
-360
lines changed

.vale/styles/config/vocabularies/Data/accept.txt

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,20 +24,25 @@ ndarray(s)?
2424
NLP
2525
[Oo]utqueue(s)?
2626
PDFs
27+
PIL
2728
[Pp]ipelined
2829
Predibase('s)?
2930
[Pp]refetch
3031
[Pp]refetching
32+
[Pp]ostprocess
3133
[Pp]reprocess
3234
[Pp]reprocessor(s)?
3335
process_file
3436
[Pp]ushdown
3537
queryable
3638
RGB
37-
runai
39+
[Rr]un[Aa][Ii]
3840
[Ss]calers
41+
SGLang
3942
Spotify('s)?
4043
TFRecord(s)?
44+
TPU(s)?
45+
[Tt]okenizer(s)?
4146
UDF(s)?
4247
VLM(s)?
4348
XGBoost

doc/source/data/api/llm.rst

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,3 +35,18 @@ Processor configs
3535
~HttpRequestProcessorConfig
3636
~vLLMEngineProcessorConfig
3737
~SGLangEngineProcessorConfig
38+
39+
.. _stage-configs-ref:
40+
41+
Stage configs
42+
-------------
43+
44+
.. autosummary::
45+
:nosignatures:
46+
:template: autosummary/class_without_autosummary_noinheritance.rst
47+
:toctree: doc/
48+
49+
~ChatTemplateStageConfig
50+
~TokenizerStageConfig
51+
~DetokenizeStageConfig
52+
~PrepareMultimodalStageConfig

doc/source/data/doc_code/working-with-llms/basic_llm_example.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -179,8 +179,8 @@
179179
),
180180
batch_size=32,
181181
concurrency=1,
182-
apply_chat_template=False,
183-
detokenize=False,
182+
chat_template_stage=False, # Skip chat templating for embeddings
183+
detokenize_stage=False, # Skip detokenization for embeddings
184184
)
185185

186186
# Example usage for embeddings
Lines changed: 46 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1,69 +1,69 @@
11
"""
2-
Documentation example and test for classification model batch inference.
2+
Classification batch inference with Ray Data LLM.
33
4-
This example demonstrates how to use Ray Data LLM with sequence classification
5-
models like educational content classifiers.
4+
Uses sequence classification models for content classifiers and sentiment analyzers.
65
"""
76

7+
# Dependency setup
88
import subprocess
99
import sys
1010

1111
subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", "ray[llm]"])
12+
subprocess.check_call(
13+
[sys.executable, "-m", "pip", "install", "--upgrade", "transformers"]
14+
)
1215
subprocess.check_call([sys.executable, "-m", "pip", "install", "numpy==1.26.4"])
1316

1417

15-
def run_classification_example():
16-
# __classification_example_start__
17-
import ray
18-
from ray.data.llm import vLLMEngineProcessorConfig, build_processor
18+
# __classification_example_start__
19+
import ray
20+
from ray.data.llm import vLLMEngineProcessorConfig, build_processor
1921

20-
# Configure vLLM for a sequence classification model
21-
classification_config = vLLMEngineProcessorConfig(
22-
model_source="nvidia/nemocurator-fineweb-nemotron-4-edu-classifier",
23-
task_type="classify", # Use 'classify' for sequence classification models
24-
engine_kwargs=dict(
25-
max_model_len=512,
26-
enforce_eager=True,
27-
),
28-
batch_size=8,
29-
concurrency=1,
30-
apply_chat_template=False,
31-
detokenize=False,
32-
)
22+
# Configure vLLM for a sequence classification model
23+
classification_config = vLLMEngineProcessorConfig(
24+
model_source="nvidia/nemocurator-fineweb-nemotron-4-edu-classifier",
25+
task_type="classify", # Use 'classify' for sequence classification models
26+
engine_kwargs=dict(
27+
max_model_len=512,
28+
enforce_eager=True,
29+
),
30+
batch_size=8,
31+
concurrency=1,
32+
apply_chat_template=False,
33+
detokenize=False,
34+
)
3335

34-
classification_processor = build_processor(
35-
classification_config,
36-
preprocess=lambda row: dict(prompt=row["text"]),
37-
postprocess=lambda row: {
38-
"text": row["prompt"],
39-
# Classification models return logits in the 'embeddings' field
40-
"edu_score": float(row["embeddings"][0])
41-
if row.get("embeddings") is not None and len(row["embeddings"]) > 0
42-
else None,
43-
},
44-
)
45-
46-
# Sample texts with varying educational quality
47-
texts = [
48-
"lol that was so funny haha",
49-
"Photosynthesis converts light energy into chemical energy.",
50-
"Newton's laws describe the relationship between forces and motion.",
51-
]
52-
ds = ray.data.from_items([{"text": text} for text in texts])
53-
54-
classified_ds = classification_processor(ds)
55-
classified_ds.show(limit=3)
56-
# __classification_example_end__
36+
classification_processor = build_processor(
37+
classification_config,
38+
preprocess=lambda row: dict(prompt=row["text"]),
39+
postprocess=lambda row: {
40+
"text": row["prompt"],
41+
# Classification models return logits in the 'embeddings' field
42+
"edu_score": float(row["embeddings"][0])
43+
if row.get("embeddings") is not None and len(row["embeddings"]) > 0
44+
else None,
45+
},
46+
)
5747

48+
# Sample texts with varying educational quality
49+
texts = [
50+
"lol that was so funny haha",
51+
"Photosynthesis converts light energy into chemical energy.",
52+
"Newton's laws describe the relationship between forces and motion.",
53+
]
54+
ds = ray.data.from_items([{"text": text} for text in texts])
5855

5956
if __name__ == "__main__":
6057
try:
6158
import torch
6259

6360
if torch.cuda.is_available():
64-
run_classification_example()
61+
classified_ds = classification_processor(ds)
62+
classified_ds.show(limit=3)
6563
else:
66-
print("Skipping classification example (no GPU available)")
64+
print("Skipping classification run (no GPU available)")
6765
except Exception as e:
68-
print(f"Skipping classification example: {e}")
66+
print(f"Skipping classification run due to environment error: {e}")
67+
# __classification_example_end__
68+
6969

doc/source/data/doc_code/working-with-llms/embedding_example.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,8 @@ def run_embedding_example():
2626
),
2727
batch_size=32,
2828
concurrency=1,
29-
apply_chat_template=False,
30-
detokenize=False,
29+
chat_template_stage=False, # Skip chat templating for embeddings
30+
detokenize_stage=False, # Skip detokenization for embeddings
3131
)
3232

3333
embedding_processor = build_processor(

doc/source/data/doc_code/working-with-llms/vlm_image_example.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -62,10 +62,8 @@
6262
trust_remote_code=True,
6363
limit_mm_per_prompt={"image": 1},
6464
),
65-
# Override Ray's runtime env to include the Hugging Face token. Ray Data uses Ray under the hood to orchestrate the inference pipeline.
6665
runtime_env=dict(
6766
env_vars=dict(
68-
# HF_TOKEN=HF_TOKEN, # Token not needed for public models
6967
VLLM_USE_V1="1",
7068
),
7169
),
@@ -186,9 +184,6 @@ def create_vlm_config():
186184
trust_remote_code=True,
187185
limit_mm_per_prompt={"image": 1},
188186
),
189-
runtime_env={
190-
# "env_vars": {"HF_TOKEN": "your-hf-token-here"} # Token not needed for public models
191-
},
192187
batch_size=1,
193188
accelerator_type="L4",
194189
concurrency=1,

0 commit comments

Comments (0)