Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
5d121f8
Implement basic test
Jonahcb Oct 24, 2025
317c4f6
Shard the model across two gpus
Jonahcb Oct 24, 2025
da4285f
Add multi-test support
Jonahcb Oct 24, 2025
4008c42
Add comprehensive test configs
Jonahcb Oct 24, 2025
30a3986
Add comprehensive test configs
Jonahcb Oct 24, 2025
121fcae
Rename moe test file
Jonahcb Oct 24, 2025
d57c7a0
Add spec decoding cases
Jonahcb Oct 26, 2025
9837cea
Merge branch 'main' into moe/comprehensive-moe-integration-tests
Jonahcb Oct 28, 2025
54c0d01
Simplify code
Jonahcb Oct 28, 2025
9dd39ce
Fix config issues
Jonahcb Oct 28, 2025
d092df0
Fix config issues
Jonahcb Oct 28, 2025
ac734f6
Add default mxfp4 moe test model
Jonahcb Oct 28, 2025
34baf32
Add configs for auto backend choosing logic
Jonahcb Oct 28, 2025
f01cb32
Rename file and remove unnecessary configs
Jonahcb Oct 28, 2025
46b34ed
Simplify configs
Jonahcb Oct 28, 2025
53e9b18
Add helpful comments
Jonahcb Oct 28, 2025
3e36425
Correct comment
Jonahcb Oct 28, 2025
c5d6d8b
Merge branch 'main' into moe/comprehensive-moe-integration-tests
Jonahcb Nov 3, 2025
0bac315
Merge branch 'main' into moe/comprehensive-moe-integration-tests
Jonahcb Nov 3, 2025
8081291
Adjust default model for each test case
Jonahcb Oct 28, 2025
e0193ec
Add default moe NVFP4 model name for test
Jonahcb Nov 3, 2025
aa9382a
Wire default NVFP4 moe model into moe integration tests
Jonahcb Nov 3, 2025
3a92ceb
Wire default NVFP4 moe model into moe integration tests configs
Jonahcb Nov 3, 2025
f8e09fa
Remove unnecessary args
Jonahcb Nov 3, 2025
1a2a1c5
Merge branch 'main' into moe/comprehensive-moe-integration-tests
Jonahcb Nov 3, 2025
f347ce3
Merge branch 'main' into moe/comprehensive-moe-integration-tests
Jonahcb Nov 4, 2025
4504c2b
Merge branch 'main' into moe/comprehensive-moe-integration-tests
Jonahcb Nov 5, 2025
aad84ef
Add to not_in_ci list
Jonahcb Nov 5, 2025
9cfb0f0
Merge branch 'main' into moe/comprehensive-moe-integration-tests
Jonahcb Nov 6, 2025
81029f0
Add config to test speculative decoding
Jonahcb Nov 6, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion python/sglang/srt/server_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -1394,7 +1394,7 @@ def _handle_moe_kernel_config(self):
if self.moe_runner_backend == "flashinfer_cutlass":
assert (
self.quantization == "modelopt_fp4"
), "modelopt_fp4 quantization is required for Flashinfer MOE"
), "modelopt_fp4 quantization is required for Flashinfer Cutlass MOE"
assert self.ep_size in [
1,
self.tp_size,
Expand Down
6 changes: 5 additions & 1 deletion python/sglang/test/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@

# NVFP4 models
DEFAULT_DEEPSEEK_NVFP4_MODEL_FOR_TEST = "nvidia/DeepSeek-V3-0324-FP4"
DEFAULT_MODEL_NAME_FOR_TEST_MOE_NVFP4 = "nvidia/Qwen3-30B-A3B-FP4"

# FP8 models
DEFAULT_MODEL_NAME_FOR_TEST_FP8 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"
Expand All @@ -71,6 +72,10 @@
DEFAULT_MODEL_NAME_FOR_TEST_QWEN_FP8 = "Qwen/Qwen3-1.7B-FP8"
DEFAULT_MODEL_NAME_FOR_TEST_FP8_WITH_MOE = "gaunernst/DeepSeek-V2-Lite-Chat-FP8"

# MXFP4 models
# Standard MXFP4 MoE test model
DEFAULT_MODEL_NAME_FOR_TEST_MXFP4_WITH_MOE = "openai/gpt-oss-20b"

# W8A8 models
DEFAULT_MODEL_NAME_FOR_TEST_W8A8 = "RedHatAI/Llama-3.2-3B-quantized.w8a8"
DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE = "nytopop/Qwen3-30B-A3B.w8a8"
Expand Down Expand Up @@ -950,7 +955,6 @@ def run_score_benchmark(
)

async def _run_benchmark():

# Load tokenizer for generating test data
from sglang.srt.utils.hf_transformers_utils import get_tokenizer

Expand Down
193 changes: 193 additions & 0 deletions test/srt/layers/moe/test_moe_runners.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
import unittest
from types import SimpleNamespace

from sglang.srt.utils import kill_process_tree
from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
DEFAULT_AWQ_MOE_MODEL_NAME_FOR_TEST,
DEFAULT_MODEL_NAME_FOR_TEST_FP8_WITH_MOE,
DEFAULT_MODEL_NAME_FOR_TEST_MOE_NVFP4,
DEFAULT_MODEL_NAME_FOR_TEST_MXFP4_WITH_MOE,
DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE,
DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_CHAT,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
CustomTestCase,
popen_launch_server,
)


class TestMoERunner(CustomTestCase):
    """Integration tests for the MoE runner backends.

    Each config launches a fresh server with one `--moe-runner-backend`
    choice, runs a tiny MMLU eval against it, and asserts the score stays
    above a sanity threshold. Servers are always torn down, even on failure.
    """

    BASE_URL = DEFAULT_URL_FOR_TEST
    TIMEOUT = DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
    # 5 examples / 1 thread keeps each case fast; this is a smoke-level
    # accuracy check, not a benchmark.
    DEFAULT_EVAL_KWARGS = {
        "eval_name": "mmlu",
        "num_examples": 5,
        "num_threads": 1,
    }

    # Backend-agnostic attention/sampling flags shared by most configs so
    # each case isolates the MoE runner under test from other GPU kernels.
    _PORTABLE_BACKEND_ARGS = [
        "--attention-backend",
        "torch_native",
        "--sampling-backend",
        "pytorch",
    ]

    CONFIGS = {
        "moe_runner_auto": {
            # Exercise the automatic backend-selection logic. NOTE: this
            # previously pinned "triton", which made the case a duplicate
            # of "moe_runner_triton" and never tested auto-selection.
            "model": DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_CHAT,
            "other_args": [
                "--trust-remote-code",
                "--moe-runner-backend",
                "auto",
                *_PORTABLE_BACKEND_ARGS,
            ],
        },
        "moe_runner_triton": {
            "model": DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_CHAT,
            "other_args": [
                "--trust-remote-code",
                "--moe-runner-backend",
                "triton",
                *_PORTABLE_BACKEND_ARGS,
            ],
        },
        "moe_runner_triton_kernel": {
            "model": DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_CHAT,
            "other_args": [
                "--trust-remote-code",
                "--moe-runner-backend",
                "triton_kernel",
                *_PORTABLE_BACKEND_ARGS,
            ],
        },
        "moe_runner_flashinfer_cutlass": {
            # Requires a model with modelopt_fp4 quantization.
            "model": DEFAULT_MODEL_NAME_FOR_TEST_MOE_NVFP4,
            "other_args": [
                "--trust-remote-code",
                "--moe-runner-backend",
                "flashinfer_cutlass",
                *_PORTABLE_BACKEND_ARGS,
            ],
        },
        "moe_runner_deep_gemm": {
            "model": DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_CHAT,
            "other_args": [
                "--trust-remote-code",
                "--moe-runner-backend",
                "deep_gemm",
                *_PORTABLE_BACKEND_ARGS,
            ],
        },
        "moe_runner_flashinfer_trtllm": {
            # modelopt_fp4 or fp8 quantization is required for the
            # Flashinfer trtllm MoE backend.
            "model": DEFAULT_MODEL_NAME_FOR_TEST_FP8_WITH_MOE,
            "other_args": [
                "--trust-remote-code",
                "--moe-runner-backend",
                "flashinfer_trtllm",
            ],
        },
        "moe_runner_flashinfer_mxfp4": {
            "model": DEFAULT_MODEL_NAME_FOR_TEST_MXFP4_WITH_MOE,
            "other_args": [
                "--trust-remote-code",
                "--moe-runner-backend",
                "flashinfer_mxfp4",
                "--quantization",
                "mxfp4",
                *_PORTABLE_BACKEND_ARGS,
            ],
        },
        "moe_runner_flashinfer_cutedsl": {
            "model": DEFAULT_MODEL_NAME_FOR_TEST_MOE_NVFP4,
            "other_args": [
                "--trust-remote-code",
                "--moe-runner-backend",
                "flashinfer_cutedsl",
                *_PORTABLE_BACKEND_ARGS,
            ],
        },
        "moe_runner_cutlass": {
            "model": DEFAULT_MODEL_NAME_FOR_TEST_MOE_NVFP4,
            "other_args": [
                "--trust-remote-code",
                "--moe-runner-backend",
                "cutlass",
                *_PORTABLE_BACKEND_ARGS,
            ],
        },
        "moe_runner_speculative": {
            # EAGLE speculative decoding with an explicit MoE runner for
            # both target and draft models.
            "model": DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_CHAT,
            "other_args": [
                "--trust-remote-code",
                "--moe-runner-backend",
                "triton",
                "--speculative-algorithm",
                "EAGLE",
                "--speculative-draft-model-path",
                DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_CHAT,
                "--speculative-moe-runner-backend",
                "triton",
                "--speculative-num-steps",
                "2",
                "--speculative-num-draft-tokens",
                "4",
                *_PORTABLE_BACKEND_ARGS,
            ],
        },
    }

    def _run_config(self, config: dict) -> None:
        """Launch a server for `config`, run the small eval, check the score.

        The server process tree is killed in `finally` so a failing eval
        cannot leak a GPU-holding process into subsequent test cases.
        """
        model = config["model"]
        other_args = config.get("other_args", [])
        # Copy so a config-specific tweak can never mutate the shared default.
        eval_kwargs = dict(self.DEFAULT_EVAL_KWARGS)

        process = popen_launch_server(
            model,
            self.BASE_URL,
            timeout=self.TIMEOUT,
            other_args=other_args,
        )
        try:
            args = SimpleNamespace(
                base_url=self.BASE_URL,
                model=model,
                **eval_kwargs,
            )
            metrics = run_eval(args)
            print(f"{metrics=}")
            # Loose threshold: only guards against a completely broken backend.
            self.assertGreaterEqual(metrics["score"], 0.48)
        finally:
            kill_process_tree(process.pid)


def _add_config_tests() -> None:
    """Attach one `test_<config>` method per entry in TestMoERunner.CONFIGS.

    Generating methods at import time lets unittest discovery treat each
    config as an independently reportable (and filterable) test case.
    """
    for cfg_name, cfg in TestMoERunner.CONFIGS.items():

        def _test(self, _cfg=cfg):  # default arg binds cfg per iteration
            self._run_config(_cfg)

        _test.__name__ = f"test_{cfg_name}"
        setattr(TestMoERunner, _test.__name__, _test)


_add_config_tests()


# Allow running this file directly; unittest discovers the generated tests.
if __name__ == "__main__":
    unittest.main()
1 change: 1 addition & 0 deletions test/srt/run_suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,7 @@ class TestFile:
TestFile("hicache/test_hicache_storage_benchmark.py"),
TestFile("hicache/test_hicache_storage_e2e.py"),
TestFile("layers/attention/nsa/test_act_quant_triton.py"),
TestFile("layers/moe/test_moe_runners.py"),
TestFile("lora/test_chunked_sgmv_backend.py"),
TestFile("lora/test_lora_llama4.py"),
TestFile("models/lora/test_lora.py"),
Expand Down
Loading