4 changes: 2 additions & 2 deletions .github/workflows/nightly-test.yml
@@ -62,7 +62,7 @@ jobs:

nightly-test-eval-vlms:
if: github.repository == 'sgl-project/sglang'
runs-on: 1-gpu-runner
runs-on: 2-gpu-runner
steps:
- name: Checkout code
uses: actions/checkout@v4
@@ -79,7 +79,7 @@ jobs:

nightly-test-perf-vlms:
if: github.repository == 'sgl-project/sglang'
runs-on: 1-gpu-runner
runs-on: 2-gpu-runner
steps:
- name: Checkout code
uses: actions/checkout@v4
65 changes: 40 additions & 25 deletions python/sglang/bench_one_batch_server.py
@@ -25,8 +25,10 @@
import numpy as np
import requests
from pydantic import BaseModel
from transformers import AutoProcessor, PreTrainedTokenizer

from sglang.bench_serving import (
get_processor,
get_tokenizer,
sample_mmmu_requests,
sample_random_requests,
@@ -104,8 +106,14 @@ def get_perfetto_relay_link_from_trace_file(trace_file: str):
if self.profile_links.extend or self.profile_links.decode:
# Create a combined link or use the first available one
trace_files = [self.profile_links.extend, self.profile_links.decode]
if any(trace_file is None for trace_file in trace_files):
logger.error("Some trace files are None", f"{trace_files=}")
trace_files_relay_links = [
f"[trace]({get_perfetto_relay_link_from_trace_file(trace_file)})"
(
f"[trace]({get_perfetto_relay_link_from_trace_file(trace_file)})"
if trace_file
else "N/A"
)
for trace_file in trace_files
]

@@ -114,30 +122,31 @@ def get_perfetto_relay_link_from_trace_file(trace_file: str):
# Build the row
return f"| {self.batch_size} | {self.input_len} | {self.latency:.2f} | {self.input_throughput:.2f} | {self.output_throughput:.2f} | {accept_length} | {itl:.2f} | {input_cost:.2f} | {output_cost:.2f} | {profile_link} |\n"

@classmethod
def generate_markdown_report(
cls, trace_dir, results: List["BenchmarkResult"]
) -> str:
"""Generate a markdown report from a list of BenchmarkResult object from a single run."""
import os

summary = f"### {results[0].model_path}\n"
def generate_markdown_report(trace_dir, results: List["BenchmarkResult"]) -> str:
"""Generate a markdown report from a list of BenchmarkResult object from a single run."""
import os

summary = f"### {results[0].model_path}\n"

# summary += (
# f"Input lens: {result.input_len}. Output lens: {result.output_len}.\n"
# )
summary += "| batch size | input len | latency (s) | input throughput (tok/s) | output throughput (tok/s) | acc length | ITL (ms) | input cost ($/1M) | output cost ($/1M) | profile (extend) | profile (decode)|\n"
summary += "| ---------- | --------- | ----------- | ------------------------- | ------------------------- | ---------- | -------- | ----------------- | ------------------ | --------------- | -------------- |\n"
# summary += (
# f"Input lens: {result.input_len}. Output lens: {result.output_len}.\n"
# )
summary += "| batch size | input len | latency (s) | input throughput (tok/s) | output throughput (tok/s) | acc length | ITL (ms) | input cost ($/1M) | output cost ($/1M) | profile (extend) | profile (decode)|\n"
summary += "| ---------- | --------- | ----------- | ------------------------- | ------------------------- | ---------- | -------- | ----------------- | ------------------ | --------------- | -------------- |\n"

# all results should share the same isl & osl
for result in results:
base_url = os.getenv("TRACE_BASE_URL", "").rstrip("/")
relay_base = os.getenv("PERFETTO_RELAY_URL", "").rstrip("/")
relay_base = "https://docs.sglang.ai/ci-data/pages/perfetto_relay.html"
# base_url = "https://github.com/sgl-project/ci-data/traces"
summary += result.to_markdown_row(trace_dir, base_url, relay_base)
# all results should share the same isl & osl
for result in results:
base_url = os.getenv(
"TRACE_BASE_URL", "https://github.com/sgl-project/ci-data/traces"
).rstrip("/")
relay_base = os.getenv(
"PERFETTO_RELAY_URL",
"https://docs.sglang.ai/ci-data/pages/perfetto_relay.html",
).rstrip("/")
summary += result.to_markdown_row(trace_dir, base_url, relay_base)

return summary
return summary


@dataclasses.dataclass
@@ -288,7 +297,7 @@ def run_one_case(
input_len_step_percentage: float,
run_name: str,
result_filename: str,
tokenizer,
tokenizer: PreTrainedTokenizer | AutoProcessor,
dataset_name="",
profile: bool = False,
profile_steps: int = 3,
@@ -302,9 +311,8 @@
if dataset_name == "mmmu":
input_requests = sample_mmmu_requests(
num_requests=batch_size,
tokenizer=tokenizer,
processor=tokenizer,
fixed_output_len=output_len,
apply_chat_template=True,
random_sample=False,
)
elif dataset_name == "random":
@@ -364,6 +372,8 @@ def run_one_case(
if dataset_name == "mmmu":
# vlm
input_ids = []
# for vlms, tokenizer is an instance of AutoProcessor
tokenizer = tokenizer.tokenizer
for input_req in input_requests:
input_ids += [tokenizer.encode(input_req.prompt)]
payload["image_data"] = [req.image_data for req in input_requests]
@@ -609,7 +619,12 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
tokenizer_path = server_info["tokenizer_path"]
elif "prefill" in server_info:
tokenizer_path = server_info["prefill"][0]["tokenizer_path"]
tokenizer = get_tokenizer(tokenizer_path)

if bench_args.dataset_name == "mmmu":
# mmmu implies this is a MLLM
tokenizer = get_processor(tokenizer_path)
else:
tokenizer = get_tokenizer(tokenizer_path)

# warmup
if not bench_args.skip_warmup:
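For reviewers skimming the diff, the core of the bench_one_batch_server.py change is that MMMU (multimodal) runs now load a full processor rather than a bare tokenizer, unwrap the inner tokenizer only where raw token IDs are needed, and tolerate missing profile traces when building markdown links. A minimal sketch of those three ideas, written against transformers directly; load_prompt_encoder, encode_prompts, and trace_links are illustrative helpers, not the benchmark's actual functions, which go through sglang's get_processor()/get_tokenizer() wrappers.

```python
from transformers import AutoProcessor, AutoTokenizer


def load_prompt_encoder(model_path: str, dataset_name: str):
    """Return an AutoProcessor for multimodal datasets, else a plain tokenizer."""
    if dataset_name == "mmmu":
        # MMMU implies a vision-language model, which needs the image processor too.
        return AutoProcessor.from_pretrained(model_path)
    return AutoTokenizer.from_pretrained(model_path)


def encode_prompts(encoder, prompts):
    """Encode text prompts to token IDs, unwrapping the tokenizer inside an AutoProcessor."""
    tokenizer = getattr(encoder, "tokenizer", encoder)
    return [tokenizer.encode(p) for p in prompts]


def trace_links(trace_files):
    """Render one markdown link per trace file, tolerating missing traces."""
    return [f"[trace]({f})" if f else "N/A" for f in trace_files]
```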
21 changes: 10 additions & 11 deletions python/sglang/bench_serving.py
@@ -12,7 +12,6 @@

import argparse
import asyncio
import base64
import io
import json
import os
@@ -671,7 +670,7 @@ def get_processor(
if pretrained_model_name_or_path.endswith(
".json"
) or pretrained_model_name_or_path.endswith(".model"):
from sglang.srt.hf_transformers_utils import get_processor
from sglang.srt.utils.hf_transformers_utils import get_processor

return get_processor(pretrained_model_name_or_path)

@@ -935,7 +934,7 @@ async def get_mooncake_request_over_time(
for i in range(num_rounds):
# Add user query for the current round
chat_history.append(
{"role": "user", "content": f"Round {i+1}: {user_query_base}"}
{"role": "user", "content": f"Round {i + 1}: {user_query_base}"}
)

# Form the full prompt from history
@@ -964,7 +963,7 @@

def sample_mmmu_requests(
num_requests: int,
processor: AutoProcessor,
processor: AutoProcessor | AutoTokenizer,
fixed_output_len: Optional[int] = None,
random_sample: bool = True,
) -> List[DatasetRow]:
@@ -973,9 +972,7 @@

Args:
num_requests: Number of requests to sample.
tokenizer: Tokenizer to use for token counting.
fixed_output_len: If provided, use this fixed output length for all requests.
apply_chat_template: Whether to apply the chat template to the prompt.
random_sample: Whether to randomly sample or take the first N.

Returns:
@@ -1282,19 +1279,21 @@ def parse_image_resolution(image_resolution: str) -> Tuple[int, int]:
)


def create_mm_data_row(text_prompt, images, images_base64, output_len, processor):
def create_mm_data_row(text_prompt, images: list, images_base64, output_len, processor):
try:
content_items = [
{"type": "image_url", "image_url": {"url": img_url}}
for img_url in images_base64
{"type": "image", "image": {"url": image_base64}}
for image_base64 in images_base64
]
content_items.append({"type": "text", "text": text_prompt})
prompt_str = processor.apply_chat_template(
[{"role": "user", "content": content_items}],
add_generation_prompt=True,
tokenize=False,
)
except Exception:
except Exception as e:
# Note (Xinyuan): This is a workaround for an issue where some tokenizers do not support content as a list. (e.g. InternVL)
print(f"Error applying chat template: {e}, fallback to <image> tag")
# Some tokenizers do not support list content; fall back to a placeholder in the text
prompt_str = f"<image>{text_prompt}"

@@ -1425,7 +1424,7 @@ def _gen_random_image_data_uri(
print(f"#Input tokens: {np.sum([x.prompt_len for x in dataset])}")
print(f"#Output tokens: {np.sum([x.output_len for x in dataset])}")
print(
f"\nCreated {len(dataset)} {image_content} {image_format} images with average {total_image_bytes//num_requests} bytes per request"
f"\nCreated {len(dataset)} {image_content} {image_format} images with average {total_image_bytes // num_requests} bytes per request"
)
return dataset

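The bench_serving.py hunks mainly adjust the multimodal prompt path: image entries are passed as `{"type": "image", ...}` items, and a failing apply_chat_template call is now logged before falling back to a plain `<image>` placeholder for tokenizers that reject list-valued content (e.g. InternVL). A hedged sketch of that fallback; build_mm_prompt is an illustrative stand-in for the relevant part of create_mm_data_row, not its exact code.

```python
from typing import List


def build_mm_prompt(processor, text_prompt: str, images_base64: List[str]) -> str:
    """Apply a chat template to mixed image/text content, with an <image>-tag fallback."""
    content = [{"type": "image", "image": {"url": url}} for url in images_base64]
    content.append({"type": "text", "text": text_prompt})
    try:
        return processor.apply_chat_template(
            [{"role": "user", "content": content}],
            add_generation_prompt=True,
            tokenize=False,
        )
    except Exception as exc:  # some tokenizers do not accept list-valued content
        print(f"Error applying chat template: {exc}, falling back to <image> tag")
        return f"<image>{text_prompt}"
```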
12 changes: 7 additions & 5 deletions test/srt/test_nightly_text_models_perf.py
@@ -3,7 +3,7 @@
import time
import unittest

from sglang.bench_one_batch_server import BenchmarkResult
from sglang.bench_one_batch_server import BenchmarkResult, generate_markdown_report
from sglang.srt.utils import kill_process_tree
from sglang.test.test_utils import (
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
@@ -41,7 +41,7 @@ def setUpClass(cls):

def test_bench_one_batch(self):
all_benchmark_results = []

all_model_succeed = True
for model_setup in self.models:
benchmark_results = []
with self.subTest(model=model_setup.model_path):
@@ -113,19 +113,21 @@ def test_bench_one_batch(self):
# Clean up JSON file
os.remove(json_output_file)
else:
all_model_succeed = False
print(f"Warning: JSON output file {json_output_file} not found")

finally:
kill_process_tree(process.pid)

report_part = BenchmarkResult.generate_markdown_report(
PROFILE_DIR, benchmark_results
)
report_part = generate_markdown_report(PROFILE_DIR, benchmark_results)
self.full_report += report_part + "\n"

if is_in_ci():
write_github_step_summary(self.full_report)

if not all_model_succeed:
raise AssertionError("Some models failed the perf tests.")


if __name__ == "__main__":
unittest.main()
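Beyond importing the module-level generate_markdown_report, the text-model perf test changes how failures surface: a missing JSON result no longer only prints a warning, it flips a flag, the full report is still written to the CI step summary, and only then does the test fail. A compact sketch of that report-then-fail pattern; the model list and run_model helper below are placeholders, not the test's real fixtures.

```python
import unittest


class NightlyPerfPattern(unittest.TestCase):
    """Illustrates the report-then-fail flow used by the nightly perf tests."""

    models = ["model-a", "model-b"]  # placeholder model names

    def run_model(self, model: str) -> bool:
        """Placeholder for launching a server and benchmarking one model."""
        return True

    def test_all_models(self):
        all_model_succeed = True
        report = ""
        for model in self.models:
            with self.subTest(model=model):
                ok = self.run_model(model)
                if not ok:
                    all_model_succeed = False  # remember the failure, keep benchmarking
                report += f"### {model}: {'ok' if ok else 'failed'}\n"
        # The report is always emitted, even when some models failed ...
        print(report)
        # ... and only afterwards does the test fail, so CI still shows every result.
        if not all_model_succeed:
            raise AssertionError("Some models failed the perf tests.")
```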
10 changes: 6 additions & 4 deletions test/srt/test_nightly_vlms_mmmu_eval.py
@@ -1,7 +1,6 @@
import json
import unittest
import warnings
from functools import partial
from types import SimpleNamespace

from sglang.srt.utils import kill_process_tree
@@ -26,16 +25,19 @@
"Efficient-Large-Model/NVILA-Lite-2B-hf-0626"
): ModelEvalMetrics(0.305, 23.8),
ModelLaunchSettings("google/gemma-3-4b-it"): ModelEvalMetrics(0.360, 10.9),
ModelLaunchSettings("google/gemma-3n-E4B-it"): ModelEvalMetrics(0.360, 15.3),
ModelLaunchSettings("google/gemma-3n-E4B-it"): ModelEvalMetrics(0.360, 17.7),
ModelLaunchSettings("mistral-community/pixtral-12b"): ModelEvalMetrics(0.360, 16.6),
ModelLaunchSettings("moonshotai/Kimi-VL-A3B-Instruct"): ModelEvalMetrics(
0.330, 22.3
),
ModelLaunchSettings("openbmb/MiniCPM-o-2_6"): ModelEvalMetrics(0.330, 29.3),
ModelLaunchSettings("openbmb/MiniCPM-v-2_6"): ModelEvalMetrics(0.270, 24.5),
ModelLaunchSettings("OpenGVLab/InternVL2_5-2B"): ModelEvalMetrics(0.300, 14.0),
ModelLaunchSettings("openbmb/MiniCPM-v-2_6"): ModelEvalMetrics(0.259, 36.3),
ModelLaunchSettings("OpenGVLab/InternVL2_5-2B"): ModelEvalMetrics(0.300, 17.0),
ModelLaunchSettings("Qwen/Qwen2-VL-7B-Instruct"): ModelEvalMetrics(0.310, 83.3),
ModelLaunchSettings("Qwen/Qwen2.5-VL-7B-Instruct"): ModelEvalMetrics(0.340, 31.9),
ModelLaunchSettings(
"Qwen/Qwen3-VL-30B-A3B-Instruct", extra_args=["--tp=2"]
): ModelEvalMetrics(0.29, 29.1),
ModelLaunchSettings(
"unsloth/Mistral-Small-3.1-24B-Instruct-2503"
): ModelEvalMetrics(0.310, 16.7),
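This eval table is also what motivates the workflow change at the top of the diff: the new Qwen3-VL-30B-A3B-Instruct entry launches with --tp=2, so the nightly VLM jobs move from a 1-gpu-runner to a 2-gpu-runner. A hedged sketch of how such a registry can carry per-model launch flags; LaunchSettings and EvalMetrics are simplified stand-ins for ModelLaunchSettings and ModelEvalMetrics, whose real fields may differ, and the field meanings (accuracy floor, eval-time ceiling) are assumptions.

```python
from dataclasses import dataclass
from typing import Dict, List, Tuple


@dataclass(frozen=True)
class LaunchSettings:
    """Simplified stand-in for ModelLaunchSettings (frozen so it can be a dict key)."""

    model_path: str
    extra_args: Tuple[str, ...] = ()


@dataclass(frozen=True)
class EvalMetrics:
    """Simplified stand-in for ModelEvalMetrics; field meanings are assumed."""

    min_accuracy: float
    max_eval_time_s: float


MODEL_THRESHOLDS: Dict[LaunchSettings, EvalMetrics] = {
    LaunchSettings("Qwen/Qwen2.5-VL-7B-Instruct"): EvalMetrics(0.340, 31.9),
    # MoE VLM sharded over two GPUs via tensor parallelism:
    LaunchSettings(
        "Qwen/Qwen3-VL-30B-A3B-Instruct", extra_args=("--tp=2",)
    ): EvalMetrics(0.29, 29.1),
}


def launch_command(settings: LaunchSettings) -> List[str]:
    """Assemble an illustrative server launch command from a registry entry."""
    return [
        "python", "-m", "sglang.launch_server",
        "--model-path", settings.model_path,
        *settings.extra_args,
    ]
```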
18 changes: 11 additions & 7 deletions test/srt/test_nightly_vlms_perf.py
@@ -3,7 +3,7 @@
import unittest
import warnings

from sglang.bench_one_batch_server import BenchmarkResult
from sglang.bench_one_batch_server import BenchmarkResult, generate_markdown_report
from sglang.srt.utils import kill_process_tree
from sglang.test.test_utils import (
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
@@ -27,6 +27,7 @@
ModelLaunchSettings(
"google/gemma-3-27b-it",
),
ModelLaunchSettings("Qwen/Qwen3-VL-30B-A3B-Instruct", extra_args=["--tp=2"]),
# "OpenGVLab/InternVL2_5-2B",
# buggy in official transformers impl
# "openbmb/MiniCPM-V-2_6",
@@ -45,9 +46,7 @@ def setUpClass(cls):
cls.models = []
model_paths = parse_models(nightly_vlm_models_str)
for model_path in model_paths:
cls.models.append(
ModelLaunchSettings(model_path, extra_args=VLM_EXTRA_ARGS)
)
cls.models.append(ModelLaunchSettings(model_path))
else:
cls.models = MODEL_DEFAULTS

@@ -60,6 +59,7 @@

def test_bench_one_batch(self):
all_benchmark_results = []
all_model_succeed = True

for model_setup in self.models:
benchmark_results = []
@@ -112,7 +112,6 @@ def test_bench_one_batch(self):
f"Error running benchmark for {model_setup.model_path} with batch size:"
)
print(result.stderr)
# Continue to next batch size even if one fails
continue

print(f"Output for {model_setup.model_path} with batch size:")
@@ -136,19 +135,24 @@
)

else:
all_model_succeed = False
print(f"Warning: JSON output file {json_output_file} not found")

finally:
kill_process_tree(process.pid)

report_part = BenchmarkResult.generate_markdown_report(
PROFILE_DIR, benchmark_results
report_part = generate_markdown_report(
PROFILE_DIR,
benchmark_results,
)
self.full_report += report_part + "\n"

if is_in_ci():
write_github_step_summary(self.full_report)

if not all_model_succeed:
raise AssertionError("Some models failed the perf tests.")


if __name__ == "__main__":
unittest.main()
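Both perf tests now import generate_markdown_report as a module-level function instead of the old BenchmarkResult.generate_markdown_report classmethod. The sketch below shows the general shape of such a helper feeding a GitHub Actions step summary; the Result dataclass, the report columns, and append_step_summary are simplified assumptions (the real tests use sglang's BenchmarkResult and test_utils.write_github_step_summary), while the GITHUB_STEP_SUMMARY environment variable is standard GitHub Actions behaviour.

```python
import os
from dataclasses import dataclass
from typing import List


@dataclass
class Result:
    """Minimal stand-in for BenchmarkResult."""

    model_path: str
    batch_size: int
    latency_s: float


def generate_markdown_report(results: List[Result]) -> str:
    """Module-level report builder, mirroring the shape of the refactored helper."""
    summary = f"### {results[0].model_path}\n"
    summary += "| batch size | latency (s) |\n| --- | --- |\n"
    for r in results:
        summary += f"| {r.batch_size} | {r.latency_s:.2f} |\n"
    return summary


def append_step_summary(markdown: str) -> None:
    """Append markdown to the GitHub Actions step summary, when running in CI."""
    path = os.getenv("GITHUB_STEP_SUMMARY")
    if path:
        with open(path, "a") as f:
            f.write(markdown)


if __name__ == "__main__":
    report = generate_markdown_report(
        [Result("demo/model", 1, 1.23), Result("demo/model", 8, 2.34)]
    )
    append_step_summary(report + "\n")
    print(report)
```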