4 changes: 2 additions & 2 deletions .github/workflows/nightly-test.yml
@@ -62,7 +62,7 @@ jobs:

nightly-test-eval-vlms:
if: github.repository == 'sgl-project/sglang'
runs-on: 1-gpu-runner
runs-on: 2-gpu-runner
steps:
- name: Checkout code
uses: actions/checkout@v4
@@ -79,7 +79,7 @@ jobs:

nightly-test-perf-vlms:
if: github.repository == 'sgl-project/sglang'
runs-on: 1-gpu-runner
runs-on: 2-gpu-runner
steps:
- name: Checkout code
uses: actions/checkout@v4
65 changes: 40 additions & 25 deletions python/sglang/bench_one_batch_server.py
@@ -25,8 +25,10 @@
import numpy as np
import requests
from pydantic import BaseModel
from transformers import AutoProcessor, PreTrainedTokenizer

from sglang.bench_serving import (
get_processor,
get_tokenizer,
sample_mmmu_requests,
sample_random_requests,
@@ -104,8 +106,14 @@ def get_perfetto_relay_link_from_trace_file(trace_file: str):
if self.profile_links.extend or self.profile_links.decode:
# Create a combined link or use the first available one
trace_files = [self.profile_links.extend, self.profile_links.decode]
if any(trace_file is None for trace_file in trace_files):
logger.error("Some trace files are None", f"{trace_files=}")
trace_files_relay_links = [
f"[trace]({get_perfetto_relay_link_from_trace_file(trace_file)})"
(
f"[trace]({get_perfetto_relay_link_from_trace_file(trace_file)})"
if trace_file
else "N/A"
)
for trace_file in trace_files
]

@@ -114,30 +122,31 @@ def get_perfetto_relay_link_from_trace_file(trace_file: str):
# Build the row
return f"| {self.batch_size} | {self.input_len} | {self.latency:.2f} | {self.input_throughput:.2f} | {self.output_throughput:.2f} | {accept_length} | {itl:.2f} | {input_cost:.2f} | {output_cost:.2f} | {profile_link} |\n"

@classmethod
def generate_markdown_report(
cls, trace_dir, results: List["BenchmarkResult"]
) -> str:
"""Generate a markdown report from a list of BenchmarkResult object from a single run."""
import os

summary = f"### {results[0].model_path}\n"
def generate_markdown_report(trace_dir, results: List["BenchmarkResult"]) -> str:
"""Generate a markdown report from a list of BenchmarkResult object from a single run."""
import os

summary = f"### {results[0].model_path}\n"

# summary += (
# f"Input lens: {result.input_len}. Output lens: {result.output_len}.\n"
# )
summary += "| batch size | input len | latency (s) | input throughput (tok/s) | output throughput (tok/s) | acc length | ITL (ms) | input cost ($/1M) | output cost ($/1M) | profile (extend) | profile (decode)|\n"
summary += "| ---------- | --------- | ----------- | ------------------------- | ------------------------- | ---------- | -------- | ----------------- | ------------------ | --------------- | -------------- |\n"
# summary += (
# f"Input lens: {result.input_len}. Output lens: {result.output_len}.\n"
# )
summary += "| batch size | input len | latency (s) | input throughput (tok/s) | output throughput (tok/s) | acc length | ITL (ms) | input cost ($/1M) | output cost ($/1M) | profile (extend) | profile (decode)|\n"
summary += "| ---------- | --------- | ----------- | ------------------------- | ------------------------- | ---------- | -------- | ----------------- | ------------------ | --------------- | -------------- |\n"

# all results should share the same isl & osl
for result in results:
base_url = os.getenv("TRACE_BASE_URL", "").rstrip("/")
relay_base = os.getenv("PERFETTO_RELAY_URL", "").rstrip("/")
relay_base = "https://docs.sglang.ai/ci-data/pages/perfetto_relay.html"
# base_url = "https://github.com/sgl-project/ci-data/traces"
summary += result.to_markdown_row(trace_dir, base_url, relay_base)
# all results should share the same isl & osl
for result in results:
base_url = os.getenv(
"TRACE_BASE_URL", "https://github.com/sgl-project/ci-data/traces"
).rstrip("/")
relay_base = os.getenv(
"PERFETTO_RELAY_URL",
"https://docs.sglang.ai/ci-data/pages/perfetto_relay.html",
).rstrip("/")
summary += result.to_markdown_row(trace_dir, base_url, relay_base)

return summary
return summary


@dataclasses.dataclass
@@ -288,7 +297,7 @@ def run_one_case(
input_len_step_percentage: float,
run_name: str,
result_filename: str,
tokenizer,
tokenizer: PreTrainedTokenizer | AutoProcessor,
dataset_name="",
profile: bool = False,
profile_steps: int = 3,
@@ -302,9 +311,8 @@
if dataset_name == "mmmu":
input_requests = sample_mmmu_requests(
num_requests=batch_size,
tokenizer=tokenizer,
processor=tokenizer,
fixed_output_len=output_len,
apply_chat_template=True,
random_sample=False,
)
elif dataset_name == "random":
@@ -364,6 +372,8 @@ def run_one_case(
if dataset_name == "mmmu":
# vlm
input_ids = []
# for vlms, tokenizer is an instance of AutoProcessor
tokenizer = tokenizer.tokenizer
for input_req in input_requests:
input_ids += [tokenizer.encode(input_req.prompt)]
payload["image_data"] = [req.image_data for req in input_requests]
@@ -609,7 +619,12 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
tokenizer_path = server_info["tokenizer_path"]
elif "prefill" in server_info:
tokenizer_path = server_info["prefill"][0]["tokenizer_path"]
tokenizer = get_tokenizer(tokenizer_path)

if bench_args.dataset_name == "mmmu":
# mmmu implies this is a MLLM
tokenizer = get_processor(tokenizer_path)
else:
tokenizer = get_tokenizer(tokenizer_path)

# warmup
if not bench_args.skip_warmup:
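For reviewers skimming the diff, the core of the bench_one_batch_server.py change is that MMMU (multimodal) runs now load a full processor rather than a bare tokenizer, unwrap the inner tokenizer only where raw token IDs are needed, and tolerate missing profile traces when building markdown links. A minimal sketch of those three ideas, written against transformers directly; load_prompt_encoder, encode_prompts, and trace_links are illustrative helpers, not the benchmark's actual functions, which go through sglang's get_processor()/get_tokenizer() wrappers.

```python
from transformers import AutoProcessor, AutoTokenizer


def load_prompt_encoder(model_path: str, dataset_name: str):
    """Return an AutoProcessor for multimodal datasets, else a plain tokenizer."""
    if dataset_name == "mmmu":
        # MMMU implies a vision-language model, which needs the image processor too.
        return AutoProcessor.from_pretrained(model_path)
    return AutoTokenizer.from_pretrained(model_path)


def encode_prompts(encoder, prompts):
    """Encode text prompts to token IDs, unwrapping the tokenizer inside an AutoProcessor."""
    tokenizer = getattr(encoder, "tokenizer", encoder)
    return [tokenizer.encode(p) for p in prompts]


def trace_links(trace_files):
    """Render one markdown link per trace file, tolerating missing traces."""
    return [f"[trace]({f})" if f else "N/A" for f in trace_files]
```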
21 changes: 10 additions & 11 deletions python/sglang/bench_serving.py
@@ -12,7 +12,6 @@

import argparse
import asyncio
import base64
import io
import json
import os
@@ -671,7 +670,7 @@ def get_processor(
if pretrained_model_name_or_path.endswith(
".json"
) or pretrained_model_name_or_path.endswith(".model"):
from sglang.srt.hf_transformers_utils import get_processor
from sglang.srt.utils.hf_transformers_utils import get_processor

return get_processor(pretrained_model_name_or_path)

@@ -935,7 +934,7 @@ async def get_mooncake_request_over_time(
for i in range(num_rounds):
# Add user query for the current round
chat_history.append(
{"role": "user", "content": f"Round {i+1}: {user_query_base}"}
{"role": "user", "content": f"Round {i + 1}: {user_query_base}"}
)

# Form the full prompt from history
@@ -964,7 +963,7 @@

def sample_mmmu_requests(
num_requests: int,
processor: AutoProcessor,
processor: AutoProcessor | AutoTokenizer,
fixed_output_len: Optional[int] = None,
random_sample: bool = True,
) -> List[DatasetRow]:
@@ -973,9 +972,7 @@

Args:
num_requests: Number of requests to sample.
tokenizer: Tokenizer to use for token counting.
fixed_output_len: If provided, use this fixed output length for all requests.
apply_chat_template: Whether to apply the chat template to the prompt.
random_sample: Whether to randomly sample or take the first N.

Returns:
@@ -1282,19 +1279,21 @@ def parse_image_resolution(image_resolution: str) -> Tuple[int, int]:
)


def create_mm_data_row(text_prompt, images, images_base64, output_len, processor):
def create_mm_data_row(text_prompt, images: list, images_base64, output_len, processor):
try:
content_items = [
{"type": "image_url", "image_url": {"url": img_url}}
for img_url in images_base64
{"type": "image", "image": {"url": image_base64}}
for image_base64 in images_base64
]
content_items.append({"type": "text", "text": text_prompt})
prompt_str = processor.apply_chat_template(
[{"role": "user", "content": content_items}],
add_generation_prompt=True,
tokenize=False,
)
except Exception:
except Exception as e:
# Note (Xinyuan): This is a workaround for an issue where some tokenizers do not support content as a list. (e.g. InternVL)
print(f"Error applying chat template: {e}, fallback to <image> tag")
# Some tokenizers do not support list content; fall back to a placeholder in the text
prompt_str = f"<image>{text_prompt}"

@@ -1425,7 +1424,7 @@ def _gen_random_image_data_uri(
print(f"#Input tokens: {np.sum([x.prompt_len for x in dataset])}")
print(f"#Output tokens: {np.sum([x.output_len for x in dataset])}")
print(
f"\nCreated {len(dataset)} {image_content} {image_format} images with average {total_image_bytes//num_requests} bytes per request"
f"\nCreated {len(dataset)} {image_content} {image_format} images with average {total_image_bytes // num_requests} bytes per request"
)
return dataset

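The bench_serving.py hunks mainly adjust the multimodal prompt path: image entries are passed as `{"type": "image", ...}` items, and a failing apply_chat_template call is now logged before falling back to a plain `<image>` placeholder for tokenizers that reject list-valued content (e.g. InternVL). A hedged sketch of that fallback; build_mm_prompt is an illustrative stand-in for the relevant part of create_mm_data_row, not its exact code.

```python
from typing import List


def build_mm_prompt(processor, text_prompt: str, images_base64: List[str]) -> str:
    """Apply a chat template to mixed image/text content, with an <image>-tag fallback."""
    content = [{"type": "image", "image": {"url": url}} for url in images_base64]
    content.append({"type": "text", "text": text_prompt})
    try:
        return processor.apply_chat_template(
            [{"role": "user", "content": content}],
            add_generation_prompt=True,
            tokenize=False,
        )
    except Exception as exc:  # some tokenizers do not accept list-valued content
        print(f"Error applying chat template: {exc}, falling back to <image> tag")
        return f"<image>{text_prompt}"
```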
12 changes: 7 additions & 5 deletions test/srt/test_nightly_text_models_perf.py
@@ -3,7 +3,7 @@
import time
import unittest

from sglang.bench_one_batch_server import BenchmarkResult
from sglang.bench_one_batch_server import BenchmarkResult, generate_markdown_report
from sglang.srt.utils import kill_process_tree
from sglang.test.test_utils import (
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
@@ -41,7 +41,7 @@ def setUpClass(cls):

def test_bench_one_batch(self):
all_benchmark_results = []

all_model_succeed = True
for model_setup in self.models:
benchmark_results = []
with self.subTest(model=model_setup.model_path):
@@ -113,19 +113,21 @@ def test_bench_one_batch(self):
# Clean up JSON file
os.remove(json_output_file)
else:
all_model_succeed = False
print(f"Warning: JSON output file {json_output_file} not found")

finally:
kill_process_tree(process.pid)

report_part = BenchmarkResult.generate_markdown_report(
PROFILE_DIR, benchmark_results
)
report_part = generate_markdown_report(PROFILE_DIR, benchmark_results)
self.full_report += report_part + "\n"

if is_in_ci():
write_github_step_summary(self.full_report)

if not all_model_succeed:
raise AssertionError("Some models failed the perf tests.")


if __name__ == "__main__":
unittest.main()
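Beyond importing the module-level generate_markdown_report, the text-model perf test changes how failures surface: a missing JSON result no longer only prints a warning, it flips a flag, the full report is still written to the CI step summary, and only then does the test fail. A compact sketch of that report-then-fail pattern; the model list and run_model helper below are placeholders, not the test's real fixtures.

```python
import unittest


class NightlyPerfPattern(unittest.TestCase):
    """Illustrates the report-then-fail flow used by the nightly perf tests."""

    models = ["model-a", "model-b"]  # placeholder model names

    def run_model(self, model: str) -> bool:
        """Placeholder for launching a server and benchmarking one model."""
        return True

    def test_all_models(self):
        all_model_succeed = True
        report = ""
        for model in self.models:
            with self.subTest(model=model):
                ok = self.run_model(model)
                if not ok:
                    all_model_succeed = False  # remember the failure, keep benchmarking
                report += f"### {model}: {'ok' if ok else 'failed'}\n"
        # The report is always emitted, even when some models failed ...
        print(report)
        # ... and only afterwards does the test fail, so CI still shows every result.
        if not all_model_succeed:
            raise AssertionError("Some models failed the perf tests.")
```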
10 changes: 6 additions & 4 deletions test/srt/test_nightly_vlms_mmmu_eval.py
@@ -1,7 +1,6 @@
import json
import unittest
import warnings
from functools import partial
from types import SimpleNamespace

from sglang.srt.utils import kill_process_tree
@@ -26,16 +25,19 @@
"Efficient-Large-Model/NVILA-Lite-2B-hf-0626"
): ModelEvalMetrics(0.305, 23.8),
ModelLaunchSettings("google/gemma-3-4b-it"): ModelEvalMetrics(0.360, 10.9),
ModelLaunchSettings("google/gemma-3n-E4B-it"): ModelEvalMetrics(0.360, 15.3),
ModelLaunchSettings("google/gemma-3n-E4B-it"): ModelEvalMetrics(0.360, 17.7),
ModelLaunchSettings("mistral-community/pixtral-12b"): ModelEvalMetrics(0.360, 16.6),
ModelLaunchSettings("moonshotai/Kimi-VL-A3B-Instruct"): ModelEvalMetrics(
0.330, 22.3
),
ModelLaunchSettings("openbmb/MiniCPM-o-2_6"): ModelEvalMetrics(0.330, 29.3),
ModelLaunchSettings("openbmb/MiniCPM-v-2_6"): ModelEvalMetrics(0.270, 24.5),
ModelLaunchSettings("OpenGVLab/InternVL2_5-2B"): ModelEvalMetrics(0.300, 14.0),
ModelLaunchSettings("openbmb/MiniCPM-v-2_6"): ModelEvalMetrics(0.259, 36.3),
ModelLaunchSettings("OpenGVLab/InternVL2_5-2B"): ModelEvalMetrics(0.300, 17.0),
ModelLaunchSettings("Qwen/Qwen2-VL-7B-Instruct"): ModelEvalMetrics(0.310, 83.3),
ModelLaunchSettings("Qwen/Qwen2.5-VL-7B-Instruct"): ModelEvalMetrics(0.340, 31.9),
ModelLaunchSettings(
"Qwen/Qwen3-VL-30B-A3B-Instruct", extra_args=["--tp=2"]
): ModelEvalMetrics(0.29, 29.1),
ModelLaunchSettings(
"unsloth/Mistral-Small-3.1-24B-Instruct-2503"
): ModelEvalMetrics(0.310, 16.7),
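This eval table is also what motivates the workflow change at the top of the diff: the new Qwen3-VL-30B-A3B-Instruct entry launches with --tp=2, so the nightly VLM jobs move from a 1-gpu-runner to a 2-gpu-runner. A hedged sketch of how such a registry can carry per-model launch flags; LaunchSettings and EvalMetrics are simplified stand-ins for ModelLaunchSettings and ModelEvalMetrics, whose real fields may differ, and the field meanings (accuracy floor, eval-time ceiling) are assumptions.

```python
from dataclasses import dataclass
from typing import Dict, List, Tuple


@dataclass(frozen=True)
class LaunchSettings:
    """Simplified stand-in for ModelLaunchSettings (frozen so it can be a dict key)."""

    model_path: str
    extra_args: Tuple[str, ...] = ()


@dataclass(frozen=True)
class EvalMetrics:
    """Simplified stand-in for ModelEvalMetrics; field meanings are assumed."""

    min_accuracy: float
    max_eval_time_s: float


MODEL_THRESHOLDS: Dict[LaunchSettings, EvalMetrics] = {
    LaunchSettings("Qwen/Qwen2.5-VL-7B-Instruct"): EvalMetrics(0.340, 31.9),
    # MoE VLM sharded over two GPUs via tensor parallelism:
    LaunchSettings(
        "Qwen/Qwen3-VL-30B-A3B-Instruct", extra_args=("--tp=2",)
    ): EvalMetrics(0.29, 29.1),
}


def launch_command(settings: LaunchSettings) -> List[str]:
    """Assemble an illustrative server launch command from a registry entry."""
    return [
        "python", "-m", "sglang.launch_server",
        "--model-path", settings.model_path,
        *settings.extra_args,
    ]
```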
18 changes: 11 additions & 7 deletions test/srt/test_nightly_vlms_perf.py
@@ -3,7 +3,7 @@
import unittest
import warnings

from sglang.bench_one_batch_server import BenchmarkResult
from sglang.bench_one_batch_server import BenchmarkResult, generate_markdown_report
from sglang.srt.utils import kill_process_tree
from sglang.test.test_utils import (
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
@@ -27,6 +27,7 @@
ModelLaunchSettings(
"google/gemma-3-27b-it",
),
ModelLaunchSettings("Qwen/Qwen3-VL-30B-A3B-Instruct", extra_args=["--tp=2"]),
# "OpenGVLab/InternVL2_5-2B",
# buggy in official transformers impl
# "openbmb/MiniCPM-V-2_6",
@@ -45,9 +46,7 @@ def setUpClass(cls):
cls.models = []
model_paths = parse_models(nightly_vlm_models_str)
for model_path in model_paths:
cls.models.append(
ModelLaunchSettings(model_path, extra_args=VLM_EXTRA_ARGS)
)
cls.models.append(ModelLaunchSettings(model_path))
else:
cls.models = MODEL_DEFAULTS

@@ -60,6 +59,7 @@

def test_bench_one_batch(self):
all_benchmark_results = []
all_model_succeed = True

for model_setup in self.models:
benchmark_results = []
@@ -112,7 +112,6 @@ def test_bench_one_batch(self):
f"Error running benchmark for {model_setup.model_path} with batch size:"
)
print(result.stderr)
# Continue to next batch size even if one fails
continue

print(f"Output for {model_setup.model_path} with batch size:")
@@ -136,19 +135,24 @@
)

else:
all_model_succeed = False
print(f"Warning: JSON output file {json_output_file} not found")

finally:
kill_process_tree(process.pid)

report_part = BenchmarkResult.generate_markdown_report(
PROFILE_DIR, benchmark_results
report_part = generate_markdown_report(
PROFILE_DIR,
benchmark_results,
)
self.full_report += report_part + "\n"

if is_in_ci():
write_github_step_summary(self.full_report)

if not all_model_succeed:
raise AssertionError("Some models failed the perf tests.")


if __name__ == "__main__":
unittest.main()
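Both perf tests now import generate_markdown_report as a module-level function instead of the old BenchmarkResult.generate_markdown_report classmethod. The sketch below shows the general shape of such a helper feeding a GitHub Actions step summary; the Result dataclass, the report columns, and append_step_summary are simplified assumptions (the real tests use sglang's BenchmarkResult and test_utils.write_github_step_summary), while the GITHUB_STEP_SUMMARY environment variable is standard GitHub Actions behaviour.

```python
import os
from dataclasses import dataclass
from typing import List


@dataclass
class Result:
    """Minimal stand-in for BenchmarkResult."""

    model_path: str
    batch_size: int
    latency_s: float


def generate_markdown_report(results: List[Result]) -> str:
    """Module-level report builder, mirroring the shape of the refactored helper."""
    summary = f"### {results[0].model_path}\n"
    summary += "| batch size | latency (s) |\n| --- | --- |\n"
    for r in results:
        summary += f"| {r.batch_size} | {r.latency_s:.2f} |\n"
    return summary


def append_step_summary(markdown: str) -> None:
    """Append markdown to the GitHub Actions step summary, when running in CI."""
    path = os.getenv("GITHUB_STEP_SUMMARY")
    if path:
        with open(path, "a") as f:
            f.write(markdown)


if __name__ == "__main__":
    report = generate_markdown_report(
        [Result("demo/model", 1, 1.23), Result("demo/model", 8, 2.34)]
    )
    append_step_summary(report + "\n")
    print(report)
```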