From b79ea87781b28c79d6a9628b749947cef251609b Mon Sep 17 00:00:00 2001
From: Lifu Huang
Date: Wed, 23 Jul 2025 22:22:09 -0700
Subject: [PATCH 1/6] Add perf tests for LoRA.

---
 .github/workflows/pr-test.yml    |   7 ++
 python/sglang/test/test_utils.py |  35 +++++-
 test/srt/test_bench_serving.py   | 180 ++++++++++++++++++++++++++++---
 3 files changed, 201 insertions(+), 21 deletions(-)

diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
index 6c79b0ae63fa..c19d9d068e89 100644
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -174,6 +174,13 @@ jobs:
           cd test/srt
           python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_eagle

+      - name: Benchmark online latency (LoRA)
+        timeout-minutes: 10
+        run: |
+          cd test/srt
+          python3 -m unittest test_bench_serving.TestBenchServing.test_lora_online_latency
+          python3 -m unittest test_bench_serving.TestBenchServing.test_lora_online_latency_with_concurrent_adapter_updates
+
   performance-test-1-gpu-part-2:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
       github.event.pull_request.draft == false

diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py
index 132a796d498e..057bc5eb9fcf 100644
--- a/python/sglang/test/test_utils.py
+++ b/python/sglang/test/test_utils.py
@@ -1,6 +1,7 @@
 """Common utilities for testing and benchmarking"""

 import argparse
+import asyncio
 import copy
 import json
 import logging
@@ -15,7 +16,7 @@
 from dataclasses import dataclass
 from functools import partial
 from types import SimpleNamespace
-from typing import Callable, List, Optional, Tuple
+from typing import Awaitable, Callable, List, Optional, Tuple

 import numpy as np
 import requests
@@ -714,6 +715,7 @@ def get_benchmark_args(
     seed: int = 0,
     device="auto",
     pd_separated: bool = False,
+    lora_name=None,
 ):
     return SimpleNamespace(
         backend="sglang",
@@ -741,7 +743,7 @@ def get_benchmark_args(
         extra_request_body=None,
         apply_chat_template=False,
         profile=None,
-        lora_name=None,
+        lora_name=lora_name,
         prompt_suffix="",
         device=device,
         pd_separated=pd_separated,
@@ -764,6 +766,8 @@ def run_bench_serving(
     need_warmup=False,
     seed: int = 0,
     device="auto",
+    background_task: Optional[Callable[[str, asyncio.Event], Awaitable[None]]] = None,
+    lora_name: Optional[str] = None,
 ):
     if device == "auto":
         device = auto_config_device()
@@ -791,14 +795,35 @@ def run_bench_serving(
         disable_ignore_eos=disable_ignore_eos,
         seed=seed,
         device=device,
+        lora_name=lora_name,
     )

-    try:
+    async def _run():
         if need_warmup:
             warmup_args = copy.deepcopy(args)
             warmup_args.num_prompts = 16
-            run_benchmark(warmup_args)
-        res = run_benchmark(args)
+            await asyncio.to_thread(run_benchmark, warmup_args)
+
+        start_event = asyncio.Event()
+        stop_event = asyncio.Event()
+        task_handle = (
+            asyncio.create_task(background_task(base_url, start_event, stop_event))
+            if background_task
+            else None
+        )
+
+        try:
+            start_event.set()
+            result = await asyncio.to_thread(run_benchmark, args)
+        finally:
+            if task_handle:
+                stop_event.set()
+                await task_handle
+
+        return result
+
+    try:
+        res = asyncio.run(_run())
     finally:
         kill_process_tree(process.pid)

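Note: the `run_bench_serving` change above adds an optional `background_task` hook. The benchmark itself now executes in a worker thread via `asyncio.to_thread`, while the task is awaited on the event loop and is handed the server base URL plus a start and a stop event; `start_event` is set right before the measured run and `stop_event` once it returns. A minimal sketch of a caller-supplied task is below; the task name and body are illustrative only, not part of this patch:

```python
import asyncio

from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, run_bench_serving


async def idle_probe_task(
    base_url: str, start_event: asyncio.Event, stop_event: asyncio.Event
):
    # Wait until the measured benchmark run begins, then idle until it finishes.
    # A real task would issue requests against `base_url` here.
    await start_event.wait()
    while not stop_event.is_set():
        await asyncio.sleep(1)


res = run_bench_serving(
    model=DEFAULT_MODEL_NAME_FOR_TEST,
    num_prompts=100,
    request_rate=8,
    other_server_args=[],
    background_task=idle_probe_task,
)
```

Running `run_benchmark` off the event loop is what lets the adapter load/unload traffic in the tests below overlap with the measured requests.
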
diff --git a/test/srt/test_bench_serving.py b/test/srt/test_bench_serving.py
index 19936c574dff..c85300abdf3c 100644
--- a/test/srt/test_bench_serving.py
+++ b/test/srt/test_bench_serving.py
@@ -1,4 +1,9 @@
+import asyncio
+import itertools
 import unittest
+from random import random, uniform
+
+import requests

 from sglang.test.test_utils import (
     DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
@@ -16,7 +21,6 @@


 class TestBenchServing(CustomTestCase):
-
     def test_offline_throughput_default(self):
         res = run_bench_serving(
             model=DEFAULT_MODEL_NAME_FOR_TEST,
@@ -28,7 +32,7 @@ def test_offline_throughput_default(self):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_offline_throughput_default\n"
-                f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
+                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
             )
             if is_in_amd_ci():
                 self.assertGreater(res["output_throughput"], 3050)
@@ -51,7 +55,7 @@ def test_offline_throughput_non_stream_small_batch_size(self):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_offline_throughput_non_stream_small_batch_size\n"
-                f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
+                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
             )
             self.assertGreater(res["output_throughput"], 1050)

@@ -66,7 +70,7 @@ def test_offline_throughput_without_radix_cache(self):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_offline_throughput_without_radix_cache\n"
-                f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
+                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
             )
             if is_in_amd_ci():
                 self.assertGreater(res["output_throughput"], 3050)
@@ -84,7 +88,7 @@ def test_offline_throughput_without_chunked_prefill(self):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_offline_throughput_without_chunked_prefill\n"
-                f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
+                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
             )
             self.assertGreater(res["output_throughput"], 2600)

@@ -104,7 +108,7 @@ def test_offline_throughput_with_triton_attention_backend(self):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_offline_throughput_with_triton_attention_backend\n"
-                f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
+                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
             )
             if is_in_amd_ci():
                 self.assertGreater(res["output_throughput"], 3500)
@@ -122,7 +126,7 @@ def test_offline_throughput_default_fp8(self):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_offline_throughput_default_fp8\n"
-                f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
+                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
             )
             if is_in_amd_ci():
                 self.assertGreater(res["output_throughput"], 3500)
@@ -140,7 +144,7 @@ def test_online_latency_default(self):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_online_latency_default\n"
-                f'median_e2e_latency_ms: {res["median_e2e_latency_ms"]:.2f} ms\n'
+                f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
             )
             self.assertLess(res["median_e2e_latency_ms"], 11000)
             if is_in_amd_ci():
@@ -164,7 +168,7 @@ def test_vlm_offline_throughput(self):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_vlm_offline_throughput\n"
-                f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
+                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
             )
             if is_in_amd_ci():
                 self.assertGreater(res["output_throughput"], 2000)
@@ -187,7 +191,7 @@ def test_vlm_online_latency(self):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_vlm_online_latency\n"
-                f'median_e2e_latency_ms: {res["median_e2e_latency_ms"]:.2f} ms\n'
+                f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
             )
             self.assertLess(res["median_e2e_latency_ms"], 16500)
             if is_in_amd_ci():
@@ -197,6 +201,150 @@ def test_vlm_online_latency(self):
             self.assertLess(res["median_ttft_ms"], 100)
             self.assertLess(res["median_itl_ms"], 8)

+    def test_lora_online_latency(self):
+        # TODO (lifuhuang): verify LoRA support in AMD.
+        if is_in_amd_ci():
+            pass
+
+        res = self._run_lora_latency_test(enable_background_task=False)
+
+        result = (
+            f"### test_lora_online_latency\n"
+            f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
+            f"median_ttft_ms: {res['median_ttft_ms']:.2f} ms\n"
+        )
+
+        print(result)
+        if is_in_ci():
+            write_github_step_summary(result)
+
+        self.assertLess(res["median_e2e_latency_ms"], 8800)
+        self.assertLess(res["median_ttft_ms"], 130)
+
+    def test_lora_online_latency_with_concurrent_adapter_updates(self):
+        # TODO (lifuhuang): verify LoRA support in AMD.
+        if is_in_amd_ci():
+            pass
+
+        res = self._run_lora_latency_test(enable_background_task=True)
+
+        result = (
+            f"### test_lora_latency_with_concurrent_adapter_updates\n"
+            f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
+            f"median_ttft_ms: {res['median_ttft_ms']:.2f} ms\n"
+        )
+
+        print(result)
+        if is_in_ci():
+            write_github_step_summary(result)
+
+        self.assertLess(res["median_e2e_latency_ms"], 12000)
+
+        # TODO (lifuhuang): This will be fixed by the overlapped LoRA update in a separate PR.
+        self.assertLess(res["median_ttft_ms"], 5500)
+
+    def _run_lora_latency_test(self, enable_background_task: bool):
+        """
+        Run a latency test for LoRA with the specified background task setting.
+        """
+
+        async def lora_loader_unloader_task(
+            base_url: str,
+            start_event: asyncio.Event,
+            stop_event: asyncio.Event,
+        ):
+            """
+            A background task that repeatedly loads and unloads a LoRA adapter.
+            """
+            print("BACKGROUND: Waiting for start_event...")
+            await start_event.wait()
+            print("BACKGROUND: Starting LoRA loader/unloader task...")
+
+            path_cycler = itertools.cycle(
+                [
+                    "pbevan11/llama-3.1-8b-ocr-correction",
+                    "faridlazuarda/valadapt-llama-3.1-8B-it-chinese",
+                    "philschmid/code-llama-3-1-8b-text-to-sql-lora",
+                ]
+            )
+            load_url = f"{base_url}/load_lora_adapter"
+            unload_url = f"{base_url}/unload_lora_adapter"
+            num_updates = 0
+
+            while not stop_event.is_set():
+                # 1. Load the LoRA adapter
+                lora_path = next(path_cycler)
+                print(f"BACKGROUND: Loading LoRA '{lora_path}'...")
+                response = await asyncio.to_thread(
+                    requests.post,
+                    load_url,
+                    json={"lora_name": lora_path, "lora_path": lora_path},
+                )
+                self.assertTrue(
+                    response.ok, f"Failed to load LoRA adapter: {response.text}"
+                )
+                num_updates += 1
+                loaded_adapters = set(response.json()["loaded_adapters"])
+                print(f"loaded_adapters: {loaded_adapters}")
+
+                if stop_event.is_set():
+                    break
+
+                # Yield control to allow other tasks to run.
+                await asyncio.sleep(1)
+
+                # 2. Unload the LoRA adapter
+                response = await asyncio.to_thread(
+                    requests.post,
+                    unload_url,
+                    json={"lora_name": lora_path},
+                )
+                self.assertTrue(
+                    response.ok, f"Failed to unload LoRA adapter: {response.text}"
+                )
+                num_updates += 1
+                loaded_adapters = set(response.json()["loaded_adapters"])
+                print(f"loaded_adapters: {loaded_adapters}")
+
+                # Yield control to allow other tasks to run.
+                await asyncio.sleep(1)
+
+            print(
+                f"BACKGROUND: LoRA loader/unloader task stopped. Total updates: {num_updates}."
+            )
+
+        background_task = lora_loader_unloader_task if enable_background_task else None
+        res = run_bench_serving(
+            model=DEFAULT_MODEL_NAME_FOR_TEST,
+            num_prompts=400,
+            request_rate=8,
+            other_server_args=[
+                "--enable-lora",
+                "--max-loras-per-batch",
+                "1",
+                "--disable-radix-cache",
+                "--random-seed",
+                "42",
+                "--mem-fraction-static",
+                "0.8",
+                "--lora-paths",
+                "Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
+                "--max-lora-rank",
+                "256",
+                "--cuda-graph-max-bs",
+                "8",
+                "--log-level",
+                "warning",
+            ],
+            dataset_name="random",
+            random_input_len=256,
+            random_output_len=256,
+            lora_name=["Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16"],
+            background_task=background_task,
+        )
+
+        return res
+
     def test_online_latency_eagle(self):
         res = run_bench_serving(
             model=DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
@@ -226,8 +374,8 @@ def test_online_latency_eagle(self):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_online_latency_eagle\n"
-                f'median_e2e_latency_ms: {res["median_e2e_latency_ms"]:.2f} ms\n'
-                f'accept_length: {res["accept_length"]:.2f} \n'
+                f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
+                f"accept_length: {res['accept_length']:.2f} \n"
             )
             if is_in_amd_ci():
                 self.assertLess(res["median_e2e_latency_ms"], 1800)
@@ -246,7 +394,7 @@ def test_moe_offline_throughput_default(self):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_moe_offline_throughput_default\n"
-                f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
+                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
             )
             if is_in_amd_ci():
                 self.assertGreater(res["output_throughput"], 2100)
@@ -264,7 +412,7 @@ def test_moe_offline_throughput_without_radix_cache(self):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_moe_offline_throughput_without_radix_cache\n"
-                f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
+                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
             )
             if is_in_amd_ci():
                 self.assertGreater(res["output_throughput"], 2100)
@@ -286,7 +434,7 @@ def test_pp_offline_throughput_default_decode(self):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_pp_offline_throughput_default_decode\n"
-                f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
+                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
            )
             self.assertGreater(res["output_throughput"], 6700)

@@ -311,7 +459,7 @@ def test_pp_long_context_prefill(self):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_pp_long_context_latency_prefill\n"
-                f'input_throughput: {res["input_throughput"]:.2f} ms\n'
+                f"input_throughput: {res['input_throughput']:.2f} ms\n"
             )
         self.assertGreater(res["input_throughput"], 4000)

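Note: the background task in `_run_lora_latency_test` drives the server's dynamic adapter endpoints while the benchmark is in flight. For reference, the same endpoints can be exercised directly against a server started with `--enable-lora`; a minimal sketch, with the URL and adapter name as placeholders:

```python
import requests

base_url = "http://127.0.0.1:30000"  # placeholder: wherever the sglang server is listening
adapter = "philschmid/code-llama-3-1-8b-text-to-sql-lora"  # one of the adapters cycled above

# Load the adapter, registering it under a name equal to its path (as the test does).
resp = requests.post(
    f"{base_url}/load_lora_adapter",
    json={"lora_name": adapter, "lora_path": adapter},
)
resp.raise_for_status()
print("loaded_adapters:", resp.json()["loaded_adapters"])

# Unload it again by name.
resp = requests.post(f"{base_url}/unload_lora_adapter", json={"lora_name": adapter})
resp.raise_for_status()
print("loaded_adapters:", resp.json()["loaded_adapters"])
```
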
From 336dcd9ca88c6c2309758d4b4e6373e7b24a1a5e Mon Sep 17 00:00:00 2001
From: Lifu Huang
Date: Thu, 24 Jul 2025 02:19:12 -0700
Subject: [PATCH 2/6] Update threshold based on repeated CI runs

---
 test/srt/test_bench_serving.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/test/srt/test_bench_serving.py b/test/srt/test_bench_serving.py
index c85300abdf3c..41705fae7a73 100644
--- a/test/srt/test_bench_serving.py
+++ b/test/srt/test_bench_serving.py
@@ -218,8 +218,8 @@ def test_lora_online_latency(self):
         if is_in_ci():
             write_github_step_summary(result)

-        self.assertLess(res["median_e2e_latency_ms"], 8800)
-        self.assertLess(res["median_ttft_ms"], 130)
+        self.assertLess(res["median_e2e_latency_ms"], 5400)
+        self.assertLess(res["median_ttft_ms"], 88)

     def test_lora_online_latency_with_concurrent_adapter_updates(self):
         # TODO (lifuhuang): verify LoRA support in AMD.
@@ -238,10 +238,10 @@ def test_lora_online_latency_with_concurrent_adapter_updates(self):
         if is_in_ci():
             write_github_step_summary(result)

-        self.assertLess(res["median_e2e_latency_ms"], 12000)
+        self.assertLess(res["median_e2e_latency_ms"], 7600)

         # TODO (lifuhuang): This will be fixed by the overlapped LoRA update in a separate PR.
-        self.assertLess(res["median_ttft_ms"], 5500)
+        self.assertLess(res["median_ttft_ms"], 3300)

     def _run_lora_latency_test(self, enable_background_task: bool):
         """
From befbada5ac62bee9dc341df7eb41d2e72af3cbf8 Mon Sep 17 00:00:00 2001
From: Lifu Huang
Date: Fri, 25 Jul 2025 23:04:42 -0700
Subject: [PATCH 3/6] Address PR comments.

---
 test/srt/test_bench_serving.py | 54 ++++++++++------------------------
 1 file changed, 16 insertions(+), 38 deletions(-)

diff --git a/test/srt/test_bench_serving.py b/test/srt/test_bench_serving.py
index 41705fae7a73..0628c95850b8 100644
--- a/test/srt/test_bench_serving.py
+++ b/test/srt/test_bench_serving.py
@@ -208,18 +208,14 @@ def test_lora_online_latency(self):

         res = self._run_lora_latency_test(enable_background_task=False)

-        result = (
-            f"### test_lora_online_latency\n"
-            f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
-            f"median_ttft_ms: {res['median_ttft_ms']:.2f} ms\n"
-        )
-
-        print(result)
         if is_in_ci():
-            write_github_step_summary(result)
-
-        self.assertLess(res["median_e2e_latency_ms"], 5400)
-        self.assertLess(res["median_ttft_ms"], 88)
+            write_github_step_summary(
+                f"### test_lora_online_latency\n"
+                f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
+                f"median_ttft_ms: {res['median_ttft_ms']:.2f} ms\n"
+            )
+            self.assertLess(res["median_e2e_latency_ms"], 5400)
+            self.assertLess(res["median_ttft_ms"], 88)

     def test_lora_online_latency_with_concurrent_adapter_updates(self):
         # TODO (lifuhuang): verify LoRA support in AMD.
@@ -228,20 +224,15 @@ def test_lora_online_latency_with_concurrent_adapter_updates(self):

         res = self._run_lora_latency_test(enable_background_task=True)

-        result = (
-            f"### test_lora_latency_with_concurrent_adapter_updates\n"
-            f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
-            f"median_ttft_ms: {res['median_ttft_ms']:.2f} ms\n"
-        )
-
-        print(result)
         if is_in_ci():
-            write_github_step_summary(result)
-
-        self.assertLess(res["median_e2e_latency_ms"], 7600)
-
-        # TODO (lifuhuang): This will be fixed by the overlapped LoRA update in a separate PR.
-        self.assertLess(res["median_ttft_ms"], 3300)
+            write_github_step_summary(
+                f"### test_lora_online_latency\n"
+                f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
+                f"median_ttft_ms: {res['median_ttft_ms']:.2f} ms\n"
+            )
+            self.assertLess(res["median_e2e_latency_ms"], 7600)
+            # TODO (lifuhuang): This will be fixed by the overlapped LoRA update in a separate PR.
+            self.assertLess(res["median_ttft_ms"], 3300)

     def _run_lora_latency_test(self, enable_background_task: bool):
         """
@@ -256,9 +247,7 @@ async def lora_loader_unloader_task(
             """
             A background task that repeatedly loads and unloads a LoRA adapter.
             """
-            print("BACKGROUND: Waiting for start_event...")
             await start_event.wait()
-            print("BACKGROUND: Starting LoRA loader/unloader task...")

             path_cycler = itertools.cycle(
                 [
@@ -274,7 +263,6 @@ async def lora_loader_unloader_task(
             while not stop_event.is_set():
                 # 1. Load the LoRA adapter
                 lora_path = next(path_cycler)
-                print(f"BACKGROUND: Loading LoRA '{lora_path}'...")
                 response = await asyncio.to_thread(
                     requests.post,
                     load_url,
@@ -284,8 +272,6 @@ async def lora_loader_unloader_task(
                     response.ok, f"Failed to load LoRA adapter: {response.text}"
                 )
                 num_updates += 1
-                loaded_adapters = set(response.json()["loaded_adapters"])
-                print(f"loaded_adapters: {loaded_adapters}")

                 if stop_event.is_set():
                     break
@@ -303,16 +289,10 @@ async def lora_loader_unloader_task(
                     response.ok, f"Failed to unload LoRA adapter: {response.text}"
                 )
                 num_updates += 1
-                loaded_adapters = set(response.json()["loaded_adapters"])
-                print(f"loaded_adapters: {loaded_adapters}")

                 # Yield control to allow other tasks to run.
                 await asyncio.sleep(1)

-            print(
-                f"BACKGROUND: LoRA loader/unloader task stopped. Total updates: {num_updates}."
-            )
-
         background_task = lora_loader_unloader_task if enable_background_task else None
         res = run_bench_serving(
             model=DEFAULT_MODEL_NAME_FOR_TEST,
@@ -332,9 +312,7 @@ async def lora_loader_unloader_task(
                 "Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
                 "--max-lora-rank",
                 "256",
                 "--cuda-graph-max-bs",
-                "8",
-                "--log-level",
-                "warning",
+                "32",
             ],
             dataset_name="random",
             random_input_len=256,

From cec8ffa12a67239826cb2d04519e19feb75ff8b5 Mon Sep 17 00:00:00 2001
From: Lifu Huang
Date: Fri, 25 Jul 2025 23:17:19 -0700
Subject: [PATCH 4/6] Remove max cuda graph limitation.

---
 test/srt/test_bench_serving.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/test/srt/test_bench_serving.py b/test/srt/test_bench_serving.py
index 0628c95850b8..84ad1e133537 100644
--- a/test/srt/test_bench_serving.py
+++ b/test/srt/test_bench_serving.py
@@ -311,8 +311,6 @@ async def lora_loader_unloader_task(
                 "Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
                 "--max-lora-rank",
                 "256",
-                "--cuda-graph-max-bs",
-                "32",
             ],
             dataset_name="random",
             random_input_len=256,

From 76e3bd2384db2661355aa203cea5ef2ec291e250 Mon Sep 17 00:00:00 2001
From: Lifu Huang
Date: Sat, 26 Jul 2025 01:37:20 -0700
Subject: [PATCH 5/6] Update threshold based on CI result.

---
 test/srt/test_bench_serving.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/test/srt/test_bench_serving.py b/test/srt/test_bench_serving.py
index 84ad1e133537..f21a5322922c 100644
--- a/test/srt/test_bench_serving.py
+++ b/test/srt/test_bench_serving.py
@@ -214,8 +214,8 @@ def test_lora_online_latency(self):
                 f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
                 f"median_ttft_ms: {res['median_ttft_ms']:.2f} ms\n"
             )
-            self.assertLess(res["median_e2e_latency_ms"], 5400)
-            self.assertLess(res["median_ttft_ms"], 88)
+            self.assertLess(res["median_e2e_latency_ms"], 2400)
+            self.assertLess(res["median_ttft_ms"], 55)

     def test_lora_online_latency_with_concurrent_adapter_updates(self):
         # TODO (lifuhuang): verify LoRA support in AMD.
@@ -230,9 +230,9 @@ def test_lora_online_latency_with_concurrent_adapter_updates(self):
                 f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
                 f"median_ttft_ms: {res['median_ttft_ms']:.2f} ms\n"
             )
-            self.assertLess(res["median_e2e_latency_ms"], 7600)
+            self.assertLess(res["median_e2e_latency_ms"], 4000)
             # TODO (lifuhuang): This will be fixed by the overlapped LoRA update in a separate PR.
-            self.assertLess(res["median_ttft_ms"], 3300)
+            self.assertLess(res["median_ttft_ms"], 1600)

     def _run_lora_latency_test(self, enable_background_task: bool):
         """

From f4bfef1e30696a0e25e0b4e1566c8b71a42e1801 Mon Sep 17 00:00:00 2001
From: Lifu Huang
Date: Sat, 26 Jul 2025 02:14:17 -0700
Subject: [PATCH 6/6] Update test_bench_serving.py

---
 test/srt/test_bench_serving.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/srt/test_bench_serving.py b/test/srt/test_bench_serving.py
index f21a5322922c..ee1346e1c18b 100644
--- a/test/srt/test_bench_serving.py
+++ b/test/srt/test_bench_serving.py
@@ -215,7 +215,7 @@ def test_lora_online_latency(self):
                 f"median_ttft_ms: {res['median_ttft_ms']:.2f} ms\n"
             )
             self.assertLess(res["median_e2e_latency_ms"], 2400)
-            self.assertLess(res["median_ttft_ms"], 55)
+            self.assertLess(res["median_ttft_ms"], 58)

     def test_lora_online_latency_with_concurrent_adapter_updates(self):
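The CI step added at the top of this series runs the two new tests one at a time with python3 -m unittest. They can also be selected programmatically for a local run; a small sketch, assuming it is executed from test/srt on a machine with a suitable GPU:

```python
import unittest

from test_bench_serving import TestBenchServing

# Build a suite containing only the two LoRA latency tests added by this series.
suite = unittest.TestSuite()
suite.addTest(TestBenchServing("test_lora_online_latency"))
suite.addTest(
    TestBenchServing("test_lora_online_latency_with_concurrent_adapter_updates")
)
unittest.TextTestRunner(verbosity=2).run(suite)
```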