diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
index 6c79b0ae63fa..c19d9d068e89 100644
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -174,6 +174,13 @@ jobs:
           cd test/srt
           python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_eagle
 
+      - name: Benchmark online latency (LoRA)
+        timeout-minutes: 10
+        run: |
+          cd test/srt
+          python3 -m unittest test_bench_serving.TestBenchServing.test_lora_online_latency
+          python3 -m unittest test_bench_serving.TestBenchServing.test_lora_online_latency_with_concurrent_adapter_updates
+
   performance-test-1-gpu-part-2:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && github.event.pull_request.draft == false
diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py
index 132a796d498e..057bc5eb9fcf 100644
--- a/python/sglang/test/test_utils.py
+++ b/python/sglang/test/test_utils.py
@@ -1,6 +1,7 @@
 """Common utilities for testing and benchmarking"""
 
 import argparse
+import asyncio
 import copy
 import json
 import logging
@@ -15,7 +16,7 @@
 from dataclasses import dataclass
 from functools import partial
 from types import SimpleNamespace
-from typing import Callable, List, Optional, Tuple
+from typing import Awaitable, Callable, List, Optional, Tuple
 
 import numpy as np
 import requests
@@ -714,6 +715,7 @@ def get_benchmark_args(
     seed: int = 0,
     device="auto",
     pd_separated: bool = False,
+    lora_name=None,
 ):
     return SimpleNamespace(
         backend="sglang",
@@ -741,7 +743,7 @@ def get_benchmark_args(
         extra_request_body=None,
         apply_chat_template=False,
         profile=None,
-        lora_name=None,
+        lora_name=lora_name,
         prompt_suffix="",
         device=device,
         pd_separated=pd_separated,
@@ -764,6 +766,8 @@ def run_bench_serving(
     need_warmup=False,
     seed: int = 0,
     device="auto",
+    background_task: Optional[Callable[[str, asyncio.Event, asyncio.Event], Awaitable[None]]] = None,
+    lora_name: Optional[List[str]] = None,
 ):
     if device == "auto":
         device = auto_config_device()
@@ -791,14 +795,35 @@ def run_bench_serving(
         disable_ignore_eos=disable_ignore_eos,
         seed=seed,
         device=device,
+        lora_name=lora_name,
     )
 
-    try:
+    async def _run():
         if need_warmup:
             warmup_args = copy.deepcopy(args)
             warmup_args.num_prompts = 16
-            run_benchmark(warmup_args)
-        res = run_benchmark(args)
+            await asyncio.to_thread(run_benchmark, warmup_args)
+
+        start_event = asyncio.Event()
+        stop_event = asyncio.Event()
+        task_handle = (
+            asyncio.create_task(background_task(base_url, start_event, stop_event))
+            if background_task
+            else None
+        )
+
+        try:
+            start_event.set()
+            result = await asyncio.to_thread(run_benchmark, args)
+        finally:
+            if task_handle:
+                stop_event.set()
+                await task_handle
+
+        return result
+
+    try:
+        res = asyncio.run(_run())
     finally:
         kill_process_tree(process.pid)
diff --git a/test/srt/test_bench_serving.py b/test/srt/test_bench_serving.py
index 19936c574dff..ee1346e1c18b 100644
--- a/test/srt/test_bench_serving.py
+++ b/test/srt/test_bench_serving.py
@@ -1,4 +1,9 @@
+import asyncio
+import itertools
 import unittest
+from random import random, uniform
+
+import requests
 
 from sglang.test.test_utils import (
     DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
@@ -16,7 +21,6 @@
 
 
 class TestBenchServing(CustomTestCase):
-
     def test_offline_throughput_default(self):
         res = run_bench_serving(
             model=DEFAULT_MODEL_NAME_FOR_TEST,
@@ -28,7 +32,7 @@ def test_offline_throughput_default(self):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_offline_throughput_default\n"
-                f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
+ f"Output throughput: {res['output_throughput']:.2f} token/s\n" ) if is_in_amd_ci(): self.assertGreater(res["output_throughput"], 3050) @@ -51,7 +55,7 @@ def test_offline_throughput_non_stream_small_batch_size(self): if is_in_ci(): write_github_step_summary( f"### test_offline_throughput_non_stream_small_batch_size\n" - f'Output throughput: {res["output_throughput"]:.2f} token/s\n' + f"Output throughput: {res['output_throughput']:.2f} token/s\n" ) self.assertGreater(res["output_throughput"], 1050) @@ -66,7 +70,7 @@ def test_offline_throughput_without_radix_cache(self): if is_in_ci(): write_github_step_summary( f"### test_offline_throughput_without_radix_cache\n" - f'Output throughput: {res["output_throughput"]:.2f} token/s\n' + f"Output throughput: {res['output_throughput']:.2f} token/s\n" ) if is_in_amd_ci(): self.assertGreater(res["output_throughput"], 3050) @@ -84,7 +88,7 @@ def test_offline_throughput_without_chunked_prefill(self): if is_in_ci(): write_github_step_summary( f"### test_offline_throughput_without_chunked_prefill\n" - f'Output throughput: {res["output_throughput"]:.2f} token/s\n' + f"Output throughput: {res['output_throughput']:.2f} token/s\n" ) self.assertGreater(res["output_throughput"], 2600) @@ -104,7 +108,7 @@ def test_offline_throughput_with_triton_attention_backend(self): if is_in_ci(): write_github_step_summary( f"### test_offline_throughput_with_triton_attention_backend\n" - f'Output throughput: {res["output_throughput"]:.2f} token/s\n' + f"Output throughput: {res['output_throughput']:.2f} token/s\n" ) if is_in_amd_ci(): self.assertGreater(res["output_throughput"], 3500) @@ -122,7 +126,7 @@ def test_offline_throughput_default_fp8(self): if is_in_ci(): write_github_step_summary( f"### test_offline_throughput_default_fp8\n" - f'Output throughput: {res["output_throughput"]:.2f} token/s\n' + f"Output throughput: {res['output_throughput']:.2f} token/s\n" ) if is_in_amd_ci(): self.assertGreater(res["output_throughput"], 3500) @@ -140,7 +144,7 @@ def test_online_latency_default(self): if is_in_ci(): write_github_step_summary( f"### test_online_latency_default\n" - f'median_e2e_latency_ms: {res["median_e2e_latency_ms"]:.2f} ms\n' + f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n" ) self.assertLess(res["median_e2e_latency_ms"], 11000) if is_in_amd_ci(): @@ -164,7 +168,7 @@ def test_vlm_offline_throughput(self): if is_in_ci(): write_github_step_summary( f"### test_vlm_offline_throughput\n" - f'Output throughput: {res["output_throughput"]:.2f} token/s\n' + f"Output throughput: {res['output_throughput']:.2f} token/s\n" ) if is_in_amd_ci(): self.assertGreater(res["output_throughput"], 2000) @@ -187,7 +191,7 @@ def test_vlm_online_latency(self): if is_in_ci(): write_github_step_summary( f"### test_vlm_online_latency\n" - f'median_e2e_latency_ms: {res["median_e2e_latency_ms"]:.2f} ms\n' + f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n" ) self.assertLess(res["median_e2e_latency_ms"], 16500) if is_in_amd_ci(): @@ -197,6 +201,126 @@ def test_vlm_online_latency(self): self.assertLess(res["median_ttft_ms"], 100) self.assertLess(res["median_itl_ms"], 8) + def test_lora_online_latency(self): + # TODO (lifuhuang): verify LoRA support in AMD. 
+        if is_in_amd_ci():
+            return
+
+        res = self._run_lora_latency_test(enable_background_task=False)
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_lora_online_latency\n"
+                f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
+                f"median_ttft_ms: {res['median_ttft_ms']:.2f} ms\n"
+            )
+            self.assertLess(res["median_e2e_latency_ms"], 2400)
+            self.assertLess(res["median_ttft_ms"], 58)
+
+    def test_lora_online_latency_with_concurrent_adapter_updates(self):
+        # TODO (lifuhuang): verify LoRA support in AMD.
+        if is_in_amd_ci():
+            return
+
+        res = self._run_lora_latency_test(enable_background_task=True)
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_lora_online_latency_with_concurrent_adapter_updates\n"
+                f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
+                f"median_ttft_ms: {res['median_ttft_ms']:.2f} ms\n"
+            )
+            self.assertLess(res["median_e2e_latency_ms"], 4000)
+            # TODO (lifuhuang): This will be fixed by the overlapped LoRA update in a separate PR.
+            self.assertLess(res["median_ttft_ms"], 1600)
+
+    def _run_lora_latency_test(self, enable_background_task: bool):
+        """
+        Run a latency test for LoRA with the specified background task setting.
+        """
+
+        async def lora_loader_unloader_task(
+            base_url: str,
+            start_event: asyncio.Event,
+            stop_event: asyncio.Event,
+        ):
+            """
+            A background task that repeatedly loads and unloads a LoRA adapter.
+            """
+            await start_event.wait()
+
+            path_cycler = itertools.cycle(
+                [
+                    "pbevan11/llama-3.1-8b-ocr-correction",
+                    "faridlazuarda/valadapt-llama-3.1-8B-it-chinese",
+                    "philschmid/code-llama-3-1-8b-text-to-sql-lora",
+                ]
+            )
+            load_url = f"{base_url}/load_lora_adapter"
+            unload_url = f"{base_url}/unload_lora_adapter"
+            num_updates = 0
+
+            while not stop_event.is_set():
+                # 1. Load the LoRA adapter
+                lora_path = next(path_cycler)
+                response = await asyncio.to_thread(
+                    requests.post,
+                    load_url,
+                    json={"lora_name": lora_path, "lora_path": lora_path},
+                )
+                self.assertTrue(
+                    response.ok, f"Failed to load LoRA adapter: {response.text}"
+                )
+                num_updates += 1
+
+                if stop_event.is_set():
+                    break
+
+                # Yield control to allow other tasks to run.
+                await asyncio.sleep(1)
+
+                # 2. Unload the LoRA adapter
+                response = await asyncio.to_thread(
+                    requests.post,
+                    unload_url,
+                    json={"lora_name": lora_path},
+                )
+                self.assertTrue(
+                    response.ok, f"Failed to unload LoRA adapter: {response.text}"
+                )
+                num_updates += 1
+
+                # Yield control to allow other tasks to run.
+                await asyncio.sleep(1)
+
+        background_task = lora_loader_unloader_task if enable_background_task else None
+        res = run_bench_serving(
+            model=DEFAULT_MODEL_NAME_FOR_TEST,
+            num_prompts=400,
+            request_rate=8,
+            other_server_args=[
+                "--enable-lora",
+                "--max-loras-per-batch",
+                "1",
+                "--disable-radix-cache",
+                "--random-seed",
+                "42",
+                "--mem-fraction-static",
+                "0.8",
+                "--lora-paths",
+                "Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
+                "--max-lora-rank",
+                "256",
+            ],
+            dataset_name="random",
+            random_input_len=256,
+            random_output_len=256,
+            lora_name=["Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16"],
+            background_task=background_task,
+        )
+
+        return res
+
     def test_online_latency_eagle(self):
         res = run_bench_serving(
             model=DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
@@ -226,8 +350,8 @@ def test_online_latency_eagle(self):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_online_latency_eagle\n"
-                f'median_e2e_latency_ms: {res["median_e2e_latency_ms"]:.2f} ms\n'
-                f'accept_length: {res["accept_length"]:.2f} \n'
+                f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
+                f"accept_length: {res['accept_length']:.2f} \n"
             )
             if is_in_amd_ci():
                 self.assertLess(res["median_e2e_latency_ms"], 1800)
@@ -246,7 +370,7 @@ def test_moe_offline_throughput_default(self):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_moe_offline_throughput_default\n"
-                f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
+                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
             )
             if is_in_amd_ci():
                 self.assertGreater(res["output_throughput"], 2100)
@@ -264,7 +388,7 @@ def test_moe_offline_throughput_without_radix_cache(self):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_moe_offline_throughput_without_radix_cache\n"
-                f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
+                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
             )
             if is_in_amd_ci():
                 self.assertGreater(res["output_throughput"], 2100)
@@ -286,7 +410,7 @@ def test_pp_offline_throughput_default_decode(self):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_pp_offline_throughput_default_decode\n"
-                f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
+                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
            )
             self.assertGreater(res["output_throughput"], 6700)
 
@@ -311,7 +435,7 @@ def test_pp_long_context_prefill(self):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_pp_long_context_latency_prefill\n"
-                f'input_throughput: {res["input_throughput"]:.2f} ms\n'
+                f"input_throughput: {res['input_throughput']:.2f} ms\n"
             )
             self.assertGreater(res["input_throughput"], 4000)
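
For reviewers, a minimal usage sketch (not part of the diff) of the `background_task` hook added to `run_bench_serving`: the hook receives the server base URL plus start/stop events and runs concurrently with the benchmark thread. The task name, prompt count, and the choice of `/get_server_info` as the side request are illustrative assumptions; any coroutine with the same signature works the same way.

```python
import asyncio

import requests

from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, run_bench_serving


async def poll_server_info(
    base_url: str, start_event: asyncio.Event, stop_event: asyncio.Event
):
    # Wait until run_bench_serving signals that the benchmark has started,
    # then issue a benign side request once per second until it signals completion.
    # (poll_server_info is a hypothetical example task, not part of this PR.)
    await start_event.wait()
    while not stop_event.is_set():
        await asyncio.to_thread(requests.get, f"{base_url}/get_server_info")
        await asyncio.sleep(1)


res = run_bench_serving(
    model=DEFAULT_MODEL_NAME_FOR_TEST,
    num_prompts=100,
    request_rate=8,
    other_server_args=[],
    background_task=poll_server_info,  # executed alongside the benchmark run
)
print(res["median_e2e_latency_ms"])
```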