7 changes: 7 additions & 0 deletions .github/workflows/pr-test.yml
@@ -174,6 +174,13 @@ jobs:
cd test/srt
python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_eagle

- name: Benchmark online latency (LoRA)
timeout-minutes: 10
run: |
cd test/srt
python3 -m unittest test_bench_serving.TestBenchServing.test_lora_online_latency
python3 -m unittest test_bench_serving.TestBenchServing.test_lora_online_latency_with_concurrent_adapter_updates

performance-test-1-gpu-part-2:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false
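For reference, the two new benchmarks can also be driven locally; a minimal sketch using the standard `unittest` loader (it assumes the working directory is `test/srt` and a GPU environment with the benchmark model and LoRA adapters available):

```python
# Hypothetical local equivalent of the new CI step: load and run the two
# LoRA latency benchmarks by their dotted names (the same names the workflow uses).
import unittest

suite = unittest.TestLoader().loadTestsFromNames(
    [
        "test_bench_serving.TestBenchServing.test_lora_online_latency",
        "test_bench_serving.TestBenchServing.test_lora_online_latency_with_concurrent_adapter_updates",
    ]
)
unittest.TextTestRunner(verbosity=2).run(suite)
```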
35 changes: 30 additions & 5 deletions python/sglang/test/test_utils.py
@@ -1,6 +1,7 @@
"""Common utilities for testing and benchmarking"""

import argparse
import asyncio
import copy
import json
import logging
@@ -15,7 +16,7 @@
from dataclasses import dataclass
from functools import partial
from types import SimpleNamespace
from typing import Callable, List, Optional, Tuple
from typing import Awaitable, Callable, List, Optional, Tuple

import numpy as np
import requests
@@ -714,6 +715,7 @@ def get_benchmark_args(
seed: int = 0,
device="auto",
pd_separated: bool = False,
lora_name=None,
):
return SimpleNamespace(
backend="sglang",
@@ -741,7 +743,7 @@
extra_request_body=None,
apply_chat_template=False,
profile=None,
lora_name=None,
lora_name=lora_name,
prompt_suffix="",
device=device,
pd_separated=pd_separated,
@@ -764,6 +766,8 @@ def run_bench_serving(
need_warmup=False,
seed: int = 0,
device="auto",
background_task: Optional[Callable[[str, asyncio.Event, asyncio.Event], Awaitable[None]]] = None,
lora_name: Optional[str] = None,
):
if device == "auto":
device = auto_config_device()
@@ -791,14 +795,35 @@
disable_ignore_eos=disable_ignore_eos,
seed=seed,
device=device,
lora_name=lora_name,
)

try:
async def _run():
if need_warmup:
warmup_args = copy.deepcopy(args)
warmup_args.num_prompts = 16
run_benchmark(warmup_args)
res = run_benchmark(args)
await asyncio.to_thread(run_benchmark, warmup_args)

start_event = asyncio.Event()
stop_event = asyncio.Event()
task_handle = (
asyncio.create_task(background_task(base_url, start_event, stop_event))
if background_task
else None
)

try:
start_event.set()
result = await asyncio.to_thread(run_benchmark, args)
finally:
if task_handle:
stop_event.set()
await task_handle

return result

try:
res = asyncio.run(_run())
finally:
kill_process_tree(process.pid)

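The new `background_task` parameter lets callers run an async coroutine alongside the benchmark: `run_bench_serving` passes the server's `base_url`, sets `start_event` just before the benchmark starts in a worker thread, and sets `stop_event` (then awaits the task) once it finishes. A minimal sketch of a conforming callback, hypothetical and for illustration only:

```python
# Sketch of a background_task callback matching the new run_bench_serving hook.
# It waits for the benchmark to start, does periodic work against the server,
# and exits promptly once the benchmark is done.
import asyncio


async def noop_background_task(
    base_url: str,
    start_event: asyncio.Event,
    stop_event: asyncio.Event,
) -> None:
    await start_event.wait()  # benchmark is about to begin
    while not stop_event.is_set():
        # Placeholder for periodic work (e.g., polling an admin endpoint at base_url).
        await asyncio.sleep(0.5)  # yield so the benchmark thread keeps running


# Usage (hypothetical): run_bench_serving(..., background_task=noop_background_task)
```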
156 changes: 140 additions & 16 deletions test/srt/test_bench_serving.py
@@ -1,4 +1,9 @@
import asyncio
import itertools
import unittest
from random import random, uniform

import requests

from sglang.test.test_utils import (
DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
@@ -16,7 +21,6 @@


class TestBenchServing(CustomTestCase):

def test_offline_throughput_default(self):
res = run_bench_serving(
model=DEFAULT_MODEL_NAME_FOR_TEST,
@@ -28,7 +32,7 @@ def test_offline_throughput_default(self):
if is_in_ci():
write_github_step_summary(
f"### test_offline_throughput_default\n"
f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
f"Output throughput: {res['output_throughput']:.2f} token/s\n"
)
if is_in_amd_ci():
self.assertGreater(res["output_throughput"], 3050)
@@ -51,7 +55,7 @@ def test_offline_throughput_non_stream_small_batch_size(self):
if is_in_ci():
write_github_step_summary(
f"### test_offline_throughput_non_stream_small_batch_size\n"
f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
f"Output throughput: {res['output_throughput']:.2f} token/s\n"
)
self.assertGreater(res["output_throughput"], 1050)

@@ -66,7 +70,7 @@ def test_offline_throughput_without_radix_cache(self):
if is_in_ci():
write_github_step_summary(
f"### test_offline_throughput_without_radix_cache\n"
f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
f"Output throughput: {res['output_throughput']:.2f} token/s\n"
)
if is_in_amd_ci():
self.assertGreater(res["output_throughput"], 3050)
@@ -84,7 +88,7 @@ def test_offline_throughput_without_chunked_prefill(self):
if is_in_ci():
write_github_step_summary(
f"### test_offline_throughput_without_chunked_prefill\n"
f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
f"Output throughput: {res['output_throughput']:.2f} token/s\n"
)
self.assertGreater(res["output_throughput"], 2600)

@@ -104,7 +108,7 @@ def test_offline_throughput_with_triton_attention_backend(self):
if is_in_ci():
write_github_step_summary(
f"### test_offline_throughput_with_triton_attention_backend\n"
f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
f"Output throughput: {res['output_throughput']:.2f} token/s\n"
)
if is_in_amd_ci():
self.assertGreater(res["output_throughput"], 3500)
@@ -122,7 +126,7 @@ def test_offline_throughput_default_fp8(self):
if is_in_ci():
write_github_step_summary(
f"### test_offline_throughput_default_fp8\n"
f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
f"Output throughput: {res['output_throughput']:.2f} token/s\n"
)
if is_in_amd_ci():
self.assertGreater(res["output_throughput"], 3500)
@@ -140,7 +144,7 @@ def test_online_latency_default(self):
if is_in_ci():
write_github_step_summary(
f"### test_online_latency_default\n"
f'median_e2e_latency_ms: {res["median_e2e_latency_ms"]:.2f} ms\n'
f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
)
self.assertLess(res["median_e2e_latency_ms"], 11000)
if is_in_amd_ci():
@@ -164,7 +168,7 @@ def test_vlm_offline_throughput(self):
if is_in_ci():
write_github_step_summary(
f"### test_vlm_offline_throughput\n"
f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
f"Output throughput: {res['output_throughput']:.2f} token/s\n"
)
if is_in_amd_ci():
self.assertGreater(res["output_throughput"], 2000)
@@ -187,7 +191,7 @@ def test_vlm_online_latency(self):
if is_in_ci():
write_github_step_summary(
f"### test_vlm_online_latency\n"
f'median_e2e_latency_ms: {res["median_e2e_latency_ms"]:.2f} ms\n'
f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
)
self.assertLess(res["median_e2e_latency_ms"], 16500)
if is_in_amd_ci():
@@ -197,6 +201,126 @@ def test_vlm_online_latency(self):
self.assertLess(res["median_ttft_ms"], 100)
self.assertLess(res["median_itl_ms"], 8)

def test_lora_online_latency(self):
# TODO (lifuhuang): verify LoRA support in AMD.
if is_in_amd_ci():
return

res = self._run_lora_latency_test(enable_background_task=False)

if is_in_ci():
write_github_step_summary(
f"### test_lora_online_latency\n"
f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
f"median_ttft_ms: {res['median_ttft_ms']:.2f} ms\n"
)
self.assertLess(res["median_e2e_latency_ms"], 2400)
self.assertLess(res["median_ttft_ms"], 58)

def test_lora_online_latency_with_concurrent_adapter_updates(self):
# TODO (lifuhuang): verify LoRA support in AMD.
if is_in_amd_ci():
return

res = self._run_lora_latency_test(enable_background_task=True)

if is_in_ci():
write_github_step_summary(
f"### test_lora_online_latency\n"
f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
f"median_ttft_ms: {res['median_ttft_ms']:.2f} ms\n"
)
self.assertLess(res["median_e2e_latency_ms"], 4000)
# TODO (lifuhuang): This will be fixed by the overlapped LoRA update in a separate PR.
self.assertLess(res["median_ttft_ms"], 1600)

def _run_lora_latency_test(self, enable_background_task: bool):
"""
Run a latency test for LoRA with the specified background task setting.
"""

async def lora_loader_unloader_task(
base_url: str,
start_event: asyncio.Event,
stop_event: asyncio.Event,
):
"""
A background task that repeatedly loads and unloads a LoRA adapter.
"""
await start_event.wait()

path_cycler = itertools.cycle(
[
"pbevan11/llama-3.1-8b-ocr-correction",
"faridlazuarda/valadapt-llama-3.1-8B-it-chinese",
"philschmid/code-llama-3-1-8b-text-to-sql-lora",
]
)
load_url = f"{base_url}/load_lora_adapter"
unload_url = f"{base_url}/unload_lora_adapter"
num_updates = 0

while not stop_event.is_set():
# 1. Load the LoRA adapter
lora_path = next(path_cycler)
response = await asyncio.to_thread(
requests.post,
load_url,
json={"lora_name": lora_path, "lora_path": lora_path},
)
self.assertTrue(
response.ok, f"Failed to load LoRA adapter: {response.text}"
)
num_updates += 1

if stop_event.is_set():
break

# Yield control to allow other tasks to run.
await asyncio.sleep(1)

# 2. Unload the LoRA adapter
response = await asyncio.to_thread(
requests.post,
unload_url,
json={"lora_name": lora_path},
)
self.assertTrue(
response.ok, f"Failed to unload LoRA adapter: {response.text}"
)
num_updates += 1

# Yield control to allow other tasks to run.
await asyncio.sleep(1)

background_task = lora_loader_unloader_task if enable_background_task else None
res = run_bench_serving(
model=DEFAULT_MODEL_NAME_FOR_TEST,
num_prompts=400,
request_rate=8,
other_server_args=[
"--enable-lora",
"--max-loras-per-batch",
"1",
"--disable-radix-cache",
"--random-seed",
"42",
"--mem-fraction-static",
"0.8",
"--lora-paths",
"Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
"--max-lora-rank",
"256",
],
dataset_name="random",
random_input_len=256,
random_output_len=256,
lora_name=["Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16"],
background_task=background_task,
)

return res

def test_online_latency_eagle(self):
res = run_bench_serving(
model=DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
@@ -226,8 +350,8 @@ def test_online_latency_eagle(self):
if is_in_ci():
write_github_step_summary(
f"### test_online_latency_eagle\n"
f'median_e2e_latency_ms: {res["median_e2e_latency_ms"]:.2f} ms\n'
f'accept_length: {res["accept_length"]:.2f} \n'
f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
f"accept_length: {res['accept_length']:.2f} \n"
)
if is_in_amd_ci():
self.assertLess(res["median_e2e_latency_ms"], 1800)
@@ -246,7 +370,7 @@ def test_moe_offline_throughput_default(self):
if is_in_ci():
write_github_step_summary(
f"### test_moe_offline_throughput_default\n"
f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
f"Output throughput: {res['output_throughput']:.2f} token/s\n"
)
if is_in_amd_ci():
self.assertGreater(res["output_throughput"], 2100)
@@ -264,7 +388,7 @@ def test_moe_offline_throughput_without_radix_cache(self):
if is_in_ci():
write_github_step_summary(
f"### test_moe_offline_throughput_without_radix_cache\n"
f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
f"Output throughput: {res['output_throughput']:.2f} token/s\n"
)
if is_in_amd_ci():
self.assertGreater(res["output_throughput"], 2100)
@@ -286,7 +410,7 @@ def test_pp_offline_throughput_default_decode(self):
if is_in_ci():
write_github_step_summary(
f"### test_pp_offline_throughput_default_decode\n"
f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
f"Output throughput: {res['output_throughput']:.2f} token/s\n"
)
self.assertGreater(res["output_throughput"], 6700)

@@ -311,7 +435,7 @@ def test_pp_long_context_prefill(self):
if is_in_ci():
write_github_step_summary(
f"### test_pp_long_context_latency_prefill\n"
f'input_throughput: {res["input_throughput"]:.2f} ms\n'
f"input_throughput: {res['input_throughput']:.2f} ms\n"
)
self.assertGreater(res["input_throughput"], 4000)

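The concurrent-update test drives the server's dynamic LoRA endpoints from its background task. The same load/unload cycle can be issued directly over HTTP; a hedged sketch in which the server address is an assumption, while the endpoints and payloads mirror the test above:

```python
# Load and then unload a LoRA adapter against a running sglang server.
# Endpoints and JSON payloads are taken from the test; the URL is hypothetical.
import requests

base_url = "http://127.0.0.1:30000"  # assumed local server address
adapter = "philschmid/code-llama-3-1-8b-text-to-sql-lora"

# Register the adapter under a name equal to its path, as the test does.
resp = requests.post(
    f"{base_url}/load_lora_adapter",
    json={"lora_name": adapter, "lora_path": adapter},
)
resp.raise_for_status()

# ... send generation requests that select the adapter via lora_name ...

# Unload it again.
resp = requests.post(f"{base_url}/unload_lora_adapter", json={"lora_name": adapter})
resp.raise_for_status()
```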