diff --git a/.github/workflows/nightly-test.yml b/.github/workflows/nightly-test.yml
index 6caa1684627b..a7535fbaca5d 100644
--- a/.github/workflows/nightly-test.yml
+++ b/.github/workflows/nightly-test.yml
@@ -62,7 +62,7 @@ jobs:
   nightly-test-eval-vlms:
     if: github.repository == 'sgl-project/sglang'
-    runs-on: 1-gpu-runner
+    runs-on: 2-gpu-runner
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -79,7 +79,7 @@ jobs:
   nightly-test-perf-vlms:
     if: github.repository == 'sgl-project/sglang'
-    runs-on: 1-gpu-runner
+    runs-on: 2-gpu-runner
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
diff --git a/python/sglang/bench_one_batch_server.py b/python/sglang/bench_one_batch_server.py
index 711236b3c1fe..adcad2ec4586 100644
--- a/python/sglang/bench_one_batch_server.py
+++ b/python/sglang/bench_one_batch_server.py
@@ -25,8 +25,10 @@
 import numpy as np
 import requests
 from pydantic import BaseModel
+from transformers import AutoProcessor, PreTrainedTokenizer

 from sglang.bench_serving import (
+    get_processor,
     get_tokenizer,
     sample_mmmu_requests,
     sample_random_requests,
@@ -104,8 +106,14 @@ def get_perfetto_relay_link_from_trace_file(trace_file: str):
         if self.profile_links.extend or self.profile_links.decode:
             # Create a combined link or use the first available one
             trace_files = [self.profile_links.extend, self.profile_links.decode]
+            if any(trace_file is None for trace_file in trace_files):
+                logger.error(f"Some trace files are None: {trace_files=}")
             trace_files_relay_links = [
-                f"[trace]({get_perfetto_relay_link_from_trace_file(trace_file)})"
+                (
+                    f"[trace]({get_perfetto_relay_link_from_trace_file(trace_file)})"
+                    if trace_file
+                    else "N/A"
+                )
                 for trace_file in trace_files
             ]

@@ -114,30 +122,31 @@ def get_perfetto_relay_link_from_trace_file(trace_file: str):
         # Build the row
         return f"| {self.batch_size} | {self.input_len} | {self.latency:.2f} | {self.input_throughput:.2f} | {self.output_throughput:.2f} | {accept_length} | {itl:.2f} | {input_cost:.2f} | {output_cost:.2f} | {profile_link} |\n"

-    @classmethod
-    def generate_markdown_report(
-        cls, trace_dir, results: List["BenchmarkResult"]
-    ) -> str:
-        """Generate a markdown report from a list of BenchmarkResult object from a single run."""
-        import os
-
-        summary = f"### {results[0].model_path}\n"
+
+def generate_markdown_report(trace_dir, results: List["BenchmarkResult"]) -> str:
+    """Generate a markdown report from a list of BenchmarkResult objects from a single run."""
+    import os
+
+    summary = f"### {results[0].model_path}\n"

-        # summary += (
-        #     f"Input lens: {result.input_len}. Output lens: {result.output_len}.\n"
-        # )
-        summary += "| batch size | input len | latency (s) | input throughput (tok/s) | output throughput (tok/s) | acc length | ITL (ms) | input cost ($/1M) | output cost ($/1M) | profile (extend) | profile (decode)|\n"
-        summary += "| ---------- | --------- | ----------- | ------------------------- | ------------------------- | ---------- | -------- | ----------------- | ------------------ | --------------- | -------------- |\n"
+    # summary += (
+    #     f"Input lens: {result.input_len}. Output lens: {result.output_len}.\n"
+    # )
+    summary += "| batch size | input len | latency (s) | input throughput (tok/s) | output throughput (tok/s) | acc length | ITL (ms) | input cost ($/1M) | output cost ($/1M) | profile (extend) | profile (decode)|\n"
+    summary += "| ---------- | --------- | ----------- | ------------------------- | ------------------------- | ---------- | -------- | ----------------- | ------------------ | --------------- | -------------- |\n"

-        # all results should share the same isl & osl
-        for result in results:
-            base_url = os.getenv("TRACE_BASE_URL", "").rstrip("/")
-            relay_base = os.getenv("PERFETTO_RELAY_URL", "").rstrip("/")
-            relay_base = "https://docs.sglang.ai/ci-data/pages/perfetto_relay.html"
-            # base_url = "https://github.com/sgl-project/ci-data/traces"
-            summary += result.to_markdown_row(trace_dir, base_url, relay_base)
+    # all results should share the same isl & osl
+    for result in results:
+        base_url = os.getenv(
+            "TRACE_BASE_URL", "https://github.com/sgl-project/ci-data/traces"
+        ).rstrip("/")
+        relay_base = os.getenv(
+            "PERFETTO_RELAY_URL",
+            "https://docs.sglang.ai/ci-data/pages/perfetto_relay.html",
+        ).rstrip("/")
+        summary += result.to_markdown_row(trace_dir, base_url, relay_base)

-        return summary
+    return summary


 @dataclasses.dataclass
@@ -288,7 +297,7 @@ def run_one_case(
     input_len_step_percentage: float,
     run_name: str,
     result_filename: str,
-    tokenizer,
+    tokenizer: PreTrainedTokenizer | AutoProcessor,
     dataset_name="",
     profile: bool = False,
     profile_steps: int = 3,
@@ -302,9 +311,8 @@ def run_one_case(
     if dataset_name == "mmmu":
         input_requests = sample_mmmu_requests(
             num_requests=batch_size,
-            tokenizer=tokenizer,
+            processor=tokenizer,
             fixed_output_len=output_len,
-            apply_chat_template=True,
             random_sample=False,
         )
     elif dataset_name == "random":
@@ -364,6 +372,8 @@ def run_one_case(
     if dataset_name == "mmmu":
         # vlm
        input_ids = []
+        # for vlms, tokenizer is an instance of AutoProcessor
+        tokenizer = tokenizer.tokenizer
         for input_req in input_requests:
             input_ids += [tokenizer.encode(input_req.prompt)]
         payload["image_data"] = [req.image_data for req in input_requests]
@@ -609,7 +619,12 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
         tokenizer_path = server_info["tokenizer_path"]
     elif "prefill" in server_info:
         tokenizer_path = server_info["prefill"][0]["tokenizer_path"]
-    tokenizer = get_tokenizer(tokenizer_path)
+
+    if bench_args.dataset_name == "mmmu":
+        # mmmu implies this is an MLLM
+        tokenizer = get_processor(tokenizer_path)
+    else:
+        tokenizer = get_tokenizer(tokenizer_path)

     # warmup
     if not bench_args.skip_warmup:
diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py
index 3b411ae72c3c..4477e6427592 100644
--- a/python/sglang/bench_serving.py
+++ b/python/sglang/bench_serving.py
@@ -12,7 +12,6 @@

 import argparse
 import asyncio
-import base64
 import io
 import json
 import os
@@ -671,7 +670,7 @@ def get_processor(
     if pretrained_model_name_or_path.endswith(
         ".json"
     ) or pretrained_model_name_or_path.endswith(".model"):
-        from sglang.srt.hf_transformers_utils import get_processor
+        from sglang.srt.utils.hf_transformers_utils import get_processor

         return get_processor(pretrained_model_name_or_path)

@@ -935,7 +934,7 @@ async def get_mooncake_request_over_time(
         for i in range(num_rounds):
             # Add user query for the current round
             chat_history.append(
-                {"role": "user", "content": f"Round {i+1}: {user_query_base}"}
+                {"role": "user", "content": f"Round {i + 1}: {user_query_base}"}
             )

             # Form the full prompt from history
@@ -964,7 +963,7 @@ async def get_mooncake_request_over_time(

 def sample_mmmu_requests(
     num_requests: int,
-    processor: AutoProcessor,
+    processor: AutoProcessor | AutoTokenizer,
     fixed_output_len: Optional[int] = None,
     random_sample: bool = True,
 ) -> List[DatasetRow]:
@@ -973,9 +972,7 @@ def sample_mmmu_requests(

     Args:
         num_requests: Number of requests to sample.
-        tokenizer: Tokenizer to use for token counting.
         fixed_output_len: If provided, use this fixed output length for all requests.
-        apply_chat_template: Whether to apply the chat template to the prompt.
         random_sample: Whether to randomly sample or take the first N.

     Returns:
@@ -1282,11 +1279,11 @@ def parse_image_resolution(image_resolution: str) -> Tuple[int, int]:
     )


-def create_mm_data_row(text_prompt, images, images_base64, output_len, processor):
+def create_mm_data_row(text_prompt, images: list, images_base64, output_len, processor):
     try:
         content_items = [
-            {"type": "image_url", "image_url": {"url": img_url}}
-            for img_url in images_base64
+            {"type": "image", "image": {"url": image_base64}}
+            for image_base64 in images_base64
         ]
         content_items.append({"type": "text", "text": text_prompt})
         prompt_str = processor.apply_chat_template(
@@ -1294,7 +1291,9 @@ def create_mm_data_row(text_prompt, images, images_base64, output_len, processor
             add_generation_prompt=True,
             tokenize=False,
         )
-    except Exception:
+    except Exception as e:
+        # Note (Xinyuan): This is a workaround for an issue where some tokenizers do not support content as a list. (e.g. InternVL)
+        print(f"Error applying chat template: {e}, fallback to <image> tag")
         # Some tokenizers do not support list content; fall back to a placeholder in the text
         prompt_str = f"<image>{text_prompt}"

@@ -1425,7 +1424,7 @@ def _gen_random_image_data_uri(
     print(f"#Input tokens: {np.sum([x.prompt_len for x in dataset])}")
     print(f"#Output tokens: {np.sum([x.output_len for x in dataset])}")
     print(
-        f"\nCreated {len(dataset)} {image_content} {image_format} images with average {total_image_bytes//num_requests} bytes per request"
+        f"\nCreated {len(dataset)} {image_content} {image_format} images with average {total_image_bytes // num_requests} bytes per request"
     )

     return dataset
diff --git a/test/srt/test_nightly_text_models_perf.py b/test/srt/test_nightly_text_models_perf.py
index 999d26289492..76250aea7309 100644
--- a/test/srt/test_nightly_text_models_perf.py
+++ b/test/srt/test_nightly_text_models_perf.py
@@ -3,7 +3,7 @@
 import time
 import unittest

-from sglang.bench_one_batch_server import BenchmarkResult
+from sglang.bench_one_batch_server import BenchmarkResult, generate_markdown_report
 from sglang.srt.utils import kill_process_tree
 from sglang.test.test_utils import (
     DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
@@ -41,7 +41,7 @@ def setUpClass(cls):

     def test_bench_one_batch(self):
         all_benchmark_results = []
-
+        all_model_succeed = True
         for model_setup in self.models:
             benchmark_results = []
             with self.subTest(model=model_setup.model_path):
@@ -113,19 +113,21 @@
                             # Clean up JSON file
                             os.remove(json_output_file)
                         else:
+                            all_model_succeed = False
                             print(f"Warning: JSON output file {json_output_file} not found")

                 finally:
                     kill_process_tree(process.pid)

-            report_part = BenchmarkResult.generate_markdown_report(
-                PROFILE_DIR, benchmark_results
-            )
+            report_part = generate_markdown_report(PROFILE_DIR, benchmark_results)
             self.full_report += report_part + "\n"

         if is_in_ci():
             write_github_step_summary(self.full_report)

+        if not all_model_succeed:
+            raise AssertionError("Some models failed the perf tests.")
+

 if __name__ == "__main__":
     unittest.main()
diff --git a/test/srt/test_nightly_vlms_mmmu_eval.py b/test/srt/test_nightly_vlms_mmmu_eval.py
index 34ba4b31a26a..f6500040d956 100644
--- a/test/srt/test_nightly_vlms_mmmu_eval.py
+++ b/test/srt/test_nightly_vlms_mmmu_eval.py
@@ -1,7 +1,6 @@
 import json
 import unittest
 import warnings
-from functools import partial
 from types import SimpleNamespace

 from sglang.srt.utils import kill_process_tree
@@ -26,16 +25,19 @@
         "Efficient-Large-Model/NVILA-Lite-2B-hf-0626"
     ): ModelEvalMetrics(0.305, 23.8),
     ModelLaunchSettings("google/gemma-3-4b-it"): ModelEvalMetrics(0.360, 10.9),
-    ModelLaunchSettings("google/gemma-3n-E4B-it"): ModelEvalMetrics(0.360, 15.3),
+    ModelLaunchSettings("google/gemma-3n-E4B-it"): ModelEvalMetrics(0.360, 17.7),
     ModelLaunchSettings("mistral-community/pixtral-12b"): ModelEvalMetrics(0.360, 16.6),
     ModelLaunchSettings("moonshotai/Kimi-VL-A3B-Instruct"): ModelEvalMetrics(
         0.330, 22.3
     ),
     ModelLaunchSettings("openbmb/MiniCPM-o-2_6"): ModelEvalMetrics(0.330, 29.3),
-    ModelLaunchSettings("openbmb/MiniCPM-v-2_6"): ModelEvalMetrics(0.270, 24.5),
-    ModelLaunchSettings("OpenGVLab/InternVL2_5-2B"): ModelEvalMetrics(0.300, 14.0),
+    ModelLaunchSettings("openbmb/MiniCPM-v-2_6"): ModelEvalMetrics(0.259, 36.3),
+    ModelLaunchSettings("OpenGVLab/InternVL2_5-2B"): ModelEvalMetrics(0.300, 17.0),
     ModelLaunchSettings("Qwen/Qwen2-VL-7B-Instruct"): ModelEvalMetrics(0.310, 83.3),
     ModelLaunchSettings("Qwen/Qwen2.5-VL-7B-Instruct"): ModelEvalMetrics(0.340, 31.9),
+    ModelLaunchSettings(
+        "Qwen/Qwen3-VL-30B-A3B-Instruct", extra_args=["--tp=2"]
+    ): ModelEvalMetrics(0.29, 29.1),
     ModelLaunchSettings(
         "unsloth/Mistral-Small-3.1-24B-Instruct-2503"
     ): ModelEvalMetrics(0.310, 16.7),
diff --git a/test/srt/test_nightly_vlms_perf.py b/test/srt/test_nightly_vlms_perf.py
index 03d2e164af31..cf54bea169a4 100644
--- a/test/srt/test_nightly_vlms_perf.py
+++ b/test/srt/test_nightly_vlms_perf.py
@@ -3,7 +3,7 @@
 import unittest
 import warnings

-from sglang.bench_one_batch_server import BenchmarkResult
+from sglang.bench_one_batch_server import BenchmarkResult, generate_markdown_report
 from sglang.srt.utils import kill_process_tree
 from sglang.test.test_utils import (
     DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
@@ -27,6 +27,7 @@
     ModelLaunchSettings(
         "google/gemma-3-27b-it",
     ),
+    ModelLaunchSettings("Qwen/Qwen3-VL-30B-A3B-Instruct", extra_args=["--tp=2"]),
     # "OpenGVLab/InternVL2_5-2B",
     # buggy in official transformers impl
     # "openbmb/MiniCPM-V-2_6",
@@ -45,9 +46,7 @@ def setUpClass(cls):
             cls.models = []
             model_paths = parse_models(nightly_vlm_models_str)
             for model_path in model_paths:
-                cls.models.append(
-                    ModelLaunchSettings(model_path, extra_args=VLM_EXTRA_ARGS)
-                )
+                cls.models.append(ModelLaunchSettings(model_path))
         else:
             cls.models = MODEL_DEFAULTS

@@ -60,6 +59,7 @@ def setUpClass(cls):

     def test_bench_one_batch(self):
         all_benchmark_results = []
+        all_model_succeed = True

         for model_setup in self.models:
             benchmark_results = []
@@ -112,7 +112,6 @@
                                 f"Error running benchmark for {model_setup.model_path} with batch size:"
                             )
                             print(result.stderr)
-                            # Continue to next batch size even if one fails
                             continue

                         print(f"Output for {model_setup.model_path} with batch size:")
@@ -136,19 +135,24 @@
                             )

                         else:
+                            all_model_succeed = False
                             print(f"Warning: JSON output file {json_output_file} not found")

                 finally:
                     kill_process_tree(process.pid)

-            report_part = BenchmarkResult.generate_markdown_report(
-                PROFILE_DIR, benchmark_results
+            report_part = generate_markdown_report(
+                PROFILE_DIR,
+                benchmark_results,
             )
             self.full_report += report_part + "\n"

         if is_in_ci():
             write_github_step_summary(self.full_report)

+        if not all_model_succeed:
+            raise AssertionError("Some models failed the perf tests.")
+

 if __name__ == "__main__":
     unittest.main()