Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
5d121f8
Implement basic test
Jonahcb Oct 24, 2025
317c4f6
Shard the model across two gpus
Jonahcb Oct 24, 2025
da4285f
Add multi-test support
Jonahcb Oct 24, 2025
4008c42
Add comprehensive test configs
Jonahcb Oct 24, 2025
30a3986
Add comprehensive test configs
Jonahcb Oct 24, 2025
121fcae
Rename moe test file
Jonahcb Oct 24, 2025
d57c7a0
Add spec decoding cases
Jonahcb Oct 26, 2025
9837cea
Merge branch 'main' into moe/comprehensive-moe-integration-tests
Jonahcb Oct 28, 2025
54c0d01
Simplify code
Jonahcb Oct 28, 2025
9dd39ce
Fix config issues
Jonahcb Oct 28, 2025
d092df0
Fix config issues
Jonahcb Oct 28, 2025
ac734f6
Add default mxfp4 moe test model
Jonahcb Oct 28, 2025
34baf32
Add configs for auto backend choosing logic
Jonahcb Oct 28, 2025
f01cb32
Rename file and remove unnecessary configs
Jonahcb Oct 28, 2025
46b34ed
Simplify configs
Jonahcb Oct 28, 2025
53e9b18
Add helpful comments
Jonahcb Oct 28, 2025
3e36425
Correct comment
Jonahcb Oct 28, 2025
c5d6d8b
Merge branch 'main' into moe/comprehensive-moe-integration-tests
Jonahcb Nov 3, 2025
0bac315
Merge branch 'main' into moe/comprehensive-moe-integration-tests
Jonahcb Nov 3, 2025
8081291
Adjust default model for each test case
Jonahcb Oct 28, 2025
e0193ec
Add default moe NVFP4 model name for test
Jonahcb Nov 3, 2025
aa9382a
Wire default NVFP4 moe model into moe integration tests
Jonahcb Nov 3, 2025
3a92ceb
Wire default NVFP4 moe model into moe integration tests configs
Jonahcb Nov 3, 2025
f8e09fa
Remove unnecessary args
Jonahcb Nov 3, 2025
1a2a1c5
Merge branch 'main' into moe/comprehensive-moe-integration-tests
Jonahcb Nov 3, 2025
f347ce3
Merge branch 'main' into moe/comprehensive-moe-integration-tests
Jonahcb Nov 4, 2025
4504c2b
Merge branch 'main' into moe/comprehensive-moe-integration-tests
Jonahcb Nov 5, 2025
aad84ef
Add to not_in_ci list
Jonahcb Nov 5, 2025
9cfb0f0
Merge branch 'main' into moe/comprehensive-moe-integration-tests
Jonahcb Nov 6, 2025
81029f0
Add config to test speculative decoding
Jonahcb Nov 6, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion python/sglang/srt/server_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -1394,7 +1394,7 @@ def _handle_moe_kernel_config(self):
if self.moe_runner_backend == "flashinfer_cutlass":
assert (
self.quantization == "modelopt_fp4"
), "modelopt_fp4 quantization is required for Flashinfer MOE"
), "modelopt_fp4 quantization is required for Flashinfer Cutlass MOE"
assert self.ep_size in [
1,
self.tp_size,
Expand Down
6 changes: 5 additions & 1 deletion python/sglang/test/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@

# NVFP4 models
DEFAULT_DEEPSEEK_NVFP4_MODEL_FOR_TEST = "nvidia/DeepSeek-V3-0324-FP4"
DEFAULT_MODEL_NAME_FOR_TEST_MOE_NVFP4 = "nvidia/Qwen3-30B-A3B-FP4"

# FP8 models
DEFAULT_MODEL_NAME_FOR_TEST_FP8 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"
Expand All @@ -71,6 +72,10 @@
DEFAULT_MODEL_NAME_FOR_TEST_QWEN_FP8 = "Qwen/Qwen3-1.7B-FP8"
DEFAULT_MODEL_NAME_FOR_TEST_FP8_WITH_MOE = "gaunernst/DeepSeek-V2-Lite-Chat-FP8"

# MXFP4 models
# Standard MXFP4 MoE test model
DEFAULT_MODEL_NAME_FOR_TEST_MXFP4_WITH_MOE = "openai/gpt-oss-20b"

# W8A8 models
DEFAULT_MODEL_NAME_FOR_TEST_W8A8 = "RedHatAI/Llama-3.2-3B-quantized.w8a8"
DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE = "nytopop/Qwen3-30B-A3B.w8a8"
Expand Down Expand Up @@ -950,7 +955,6 @@ def run_score_benchmark(
)

async def _run_benchmark():

# Load tokenizer for generating test data
from sglang.srt.utils.hf_transformers_utils import get_tokenizer

Expand Down
193 changes: 193 additions & 0 deletions test/srt/layers/moe/test_moe_runners.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
import unittest
from types import SimpleNamespace

from sglang.srt.utils import kill_process_tree
from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
DEFAULT_AWQ_MOE_MODEL_NAME_FOR_TEST,
DEFAULT_MODEL_NAME_FOR_TEST_FP8_WITH_MOE,
DEFAULT_MODEL_NAME_FOR_TEST_MOE_NVFP4,
DEFAULT_MODEL_NAME_FOR_TEST_MXFP4_WITH_MOE,
DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE,
DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_CHAT,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
CustomTestCase,
popen_launch_server,
)


class TestMoERunner(CustomTestCase):
    """Integration tests for the MoE runner backends.

    Each config launches a fresh server with one `--moe-runner-backend`
    choice, runs a tiny MMLU eval against it, and asserts the score stays
    above a sanity threshold. Servers are always torn down, even on failure.
    """

    BASE_URL = DEFAULT_URL_FOR_TEST
    TIMEOUT = DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
    # 5 examples / 1 thread keeps each case fast; this is a smoke-level
    # accuracy check, not a benchmark.
    DEFAULT_EVAL_KWARGS = {
        "eval_name": "mmlu",
        "num_examples": 5,
        "num_threads": 1,
    }

    # Backend-agnostic attention/sampling flags shared by most configs so
    # each case isolates the MoE runner under test from other GPU kernels.
    _PORTABLE_BACKEND_ARGS = [
        "--attention-backend",
        "torch_native",
        "--sampling-backend",
        "pytorch",
    ]

    CONFIGS = {
        "moe_runner_auto": {
            # Exercise the automatic backend-selection logic. NOTE: this
            # previously pinned "triton", which made the case a duplicate
            # of "moe_runner_triton" and never tested auto-selection.
            "model": DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_CHAT,
            "other_args": [
                "--trust-remote-code",
                "--moe-runner-backend",
                "auto",
                *_PORTABLE_BACKEND_ARGS,
            ],
        },
        "moe_runner_triton": {
            "model": DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_CHAT,
            "other_args": [
                "--trust-remote-code",
                "--moe-runner-backend",
                "triton",
                *_PORTABLE_BACKEND_ARGS,
            ],
        },
        "moe_runner_triton_kernel": {
            "model": DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_CHAT,
            "other_args": [
                "--trust-remote-code",
                "--moe-runner-backend",
                "triton_kernel",
                *_PORTABLE_BACKEND_ARGS,
            ],
        },
        "moe_runner_flashinfer_cutlass": {
            # Requires a model with modelopt_fp4 quantization.
            "model": DEFAULT_MODEL_NAME_FOR_TEST_MOE_NVFP4,
            "other_args": [
                "--trust-remote-code",
                "--moe-runner-backend",
                "flashinfer_cutlass",
                *_PORTABLE_BACKEND_ARGS,
            ],
        },
        "moe_runner_deep_gemm": {
            "model": DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_CHAT,
            "other_args": [
                "--trust-remote-code",
                "--moe-runner-backend",
                "deep_gemm",
                *_PORTABLE_BACKEND_ARGS,
            ],
        },
        "moe_runner_flashinfer_trtllm": {
            # modelopt_fp4 or fp8 quantization is required for the
            # Flashinfer trtllm MoE backend.
            "model": DEFAULT_MODEL_NAME_FOR_TEST_FP8_WITH_MOE,
            "other_args": [
                "--trust-remote-code",
                "--moe-runner-backend",
                "flashinfer_trtllm",
            ],
        },
        "moe_runner_flashinfer_mxfp4": {
            "model": DEFAULT_MODEL_NAME_FOR_TEST_MXFP4_WITH_MOE,
            "other_args": [
                "--trust-remote-code",
                "--moe-runner-backend",
                "flashinfer_mxfp4",
                "--quantization",
                "mxfp4",
                *_PORTABLE_BACKEND_ARGS,
            ],
        },
        "moe_runner_flashinfer_cutedsl": {
            "model": DEFAULT_MODEL_NAME_FOR_TEST_MOE_NVFP4,
            "other_args": [
                "--trust-remote-code",
                "--moe-runner-backend",
                "flashinfer_cutedsl",
                *_PORTABLE_BACKEND_ARGS,
            ],
        },
        "moe_runner_cutlass": {
            "model": DEFAULT_MODEL_NAME_FOR_TEST_MOE_NVFP4,
            "other_args": [
                "--trust-remote-code",
                "--moe-runner-backend",
                "cutlass",
                *_PORTABLE_BACKEND_ARGS,
            ],
        },
        "moe_runner_speculative": {
            # EAGLE speculative decoding with an explicit MoE runner for
            # both target and draft models.
            "model": DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_CHAT,
            "other_args": [
                "--trust-remote-code",
                "--moe-runner-backend",
                "triton",
                "--speculative-algorithm",
                "EAGLE",
                "--speculative-draft-model-path",
                DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_CHAT,
                "--speculative-moe-runner-backend",
                "triton",
                "--speculative-num-steps",
                "2",
                "--speculative-num-draft-tokens",
                "4",
                *_PORTABLE_BACKEND_ARGS,
            ],
        },
    }

    def _run_config(self, config: dict) -> None:
        """Launch a server for `config`, run the small eval, check the score.

        The server process tree is killed in `finally` so a failing eval
        cannot leak a GPU-holding process into subsequent test cases.
        """
        model = config["model"]
        other_args = config.get("other_args", [])
        # Copy so a config-specific tweak can never mutate the shared default.
        eval_kwargs = dict(self.DEFAULT_EVAL_KWARGS)

        process = popen_launch_server(
            model,
            self.BASE_URL,
            timeout=self.TIMEOUT,
            other_args=other_args,
        )
        try:
            args = SimpleNamespace(
                base_url=self.BASE_URL,
                model=model,
                **eval_kwargs,
            )
            metrics = run_eval(args)
            print(f"{metrics=}")
            # Loose threshold: only guards against a completely broken backend.
            self.assertGreaterEqual(metrics["score"], 0.48)
        finally:
            kill_process_tree(process.pid)


def _add_config_tests() -> None:
    """Attach one `test_<config>` method per entry in TestMoERunner.CONFIGS.

    Generating methods at import time lets unittest discovery treat each
    config as an independently reportable (and filterable) test case.
    """
    for cfg_name, cfg in TestMoERunner.CONFIGS.items():

        def _test(self, _cfg=cfg):  # default arg binds cfg per iteration
            self._run_config(_cfg)

        _test.__name__ = f"test_{cfg_name}"
        setattr(TestMoERunner, _test.__name__, _test)


_add_config_tests()


# Allow running this file directly; unittest discovers the generated tests.
if __name__ == "__main__":
    unittest.main()
1 change: 1 addition & 0 deletions test/srt/run_suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,7 @@ class TestFile:
TestFile("hicache/test_hicache_storage_benchmark.py"),
TestFile("hicache/test_hicache_storage_e2e.py"),
TestFile("layers/attention/nsa/test_act_quant_triton.py"),
TestFile("layers/moe/test_moe_runners.py"),
TestFile("lora/test_chunked_sgmv_backend.py"),
TestFile("lora/test_lora_llama4.py"),
TestFile("models/lora/test_lora.py"),
Expand Down
Loading