Support piecewise cuda graph for dsv3 fp4 (#15531)

ispobock · web-flow · commit 8fe3e3746832 · 2025-12-21T14:50:32.000+08:00
diff --git a/python/sglang/srt/layers/attention/trtllm_mla_backend.py b/python/sglang/srt/layers/attention/trtllm_mla_backend.py
@@ -12,6 +12,7 @@
 import triton
 import triton.language as tl
 
+from sglang.srt.compilation.piecewise_context_manager import is_in_piecewise_cuda_graph
 from sglang.srt.layers.attention.flashinfer_mla_backend import (
     FlashInferMLAAttnBackend,
     FlashInferMLAMultiStepDraftBackend,
@@ -582,10 +583,11 @@ def init_forward_metadata(self, forward_batch: ForwardBatch):
         ):
             # For extend batch with prefix length > 0, fallback to ragged kernel implemented in flashinfer MLA backend
             # when chunked prefix cache is disabled.
+            # Also fall back to the flashinfer MLA backend inside a piecewise cuda graph, which only supports the MLA forward mode.
             has_prefix = any(forward_batch.extend_prefix_lens_cpu)
             fallback_to_flashinfer_impl = (
                 self.disable_chunked_prefix_cache and has_prefix
-            )
+            ) or is_in_piecewise_cuda_graph()
             if fallback_to_flashinfer_impl:
                 super().init_forward_metadata(forward_batch)
 
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/layer.py b/python/sglang/srt/layers/moe/fused_moe_triton/layer.py
@@ -41,7 +41,13 @@
     StandardDispatcher,
     StandardDispatchOutput,
 )
-from sglang.srt.layers.moe.topk import StandardTopKOutput, TopKOutput, TopKOutputChecker
+from sglang.srt.layers.moe.topk import (
+    BypassedTopKOutput,
+    StandardTopKOutput,
+    TopKConfig,
+    TopKOutput,
+    TopKOutputChecker,
+)
 from sglang.srt.layers.moe.utils import RoutingMethodType
 from sglang.srt.layers.quantization.base_config import (
     FusedMoEMethodBase,
@@ -1210,16 +1216,21 @@ def _quantize_hidden_states_fp4(self, hidden_states: torch.Tensor):
         return hs_fp4, hs_sf
 
     def forward(self, hidden_states: torch.Tensor, topk_output: TopKOutput):
+        """Run the MoE layer, dispatching to a custom op inside a piecewise cuda graph.
+
+        Args:
+            hidden_states: Input activations handed to the MoE implementation.
+            topk_output: Routing information; must be in the bypassed format.
+
+        Returns:
+            The output of ``forward_impl``, or of the
+            ``flashinfer_fp4_moe_forward_piecewise_cuda_graph_impl`` custom op
+            when running inside a piecewise cuda graph.
+
+        Raises:
+            AssertionError: If ``topk_output`` is not in the bypassed format.
+        """
+        # The format check applies on both paths, not only under piecewise cuda graph.
+        assert TopKOutputChecker.format_is_bypassed(
+            topk_output
+        ), "Only bypassed topk output is supported for flashinfer fp4 moe"
+
         if is_in_piecewise_cuda_graph():
-            assert TopKOutputChecker.format_is_standard(
-                topk_output
-            ), "Only standard topk output is supported for piecewise cuda graph"
-            return torch.ops.sglang.moe_forward_piecewise_cuda_graph_impl(
-                hidden_states,
-                topk_output.topk_weights,
-                topk_output.topk_ids,
-                topk_output.router_logits,
-                self.layer_id,
+            # The topk config is passed field by field together with the layer id;
+            # the op implementation reassembles a BypassedTopKOutput from them.
+            return (
+                torch.ops.sglang.flashinfer_fp4_moe_forward_piecewise_cuda_graph_impl(
+                    hidden_states,
+                    topk_output.router_logits,
+                    topk_output.topk_config.top_k,
+                    topk_output.topk_config.topk_group,
+                    topk_output.topk_config.num_expert_group,
+                    topk_output.topk_config.correction_bias,
+                    self.layer_id,
+                )
             )
         else:
             return self.forward_impl(hidden_states, topk_output)
@@ -1343,9 +1354,52 @@ def moe_forward_piecewise_cuda_graph_impl_fake(
     return torch.empty_like(hidden_states)
 
 
+def flashinfer_fp4_moe_forward_piecewise_cuda_graph_impl(
+    hidden_states: torch.Tensor,
+    router_logits: torch.Tensor,
+    top_k: int,
+    topk_group: Optional[int],
+    num_expert_group: Optional[int],
+    correction_bias: Optional[torch.Tensor],
+    layer_id: int,
+) -> torch.Tensor:
+    """Custom-op body that runs a flashinfer fp4 MoE layer by its layer id.
+
+    The routing configuration arrives as separate arguments and is reassembled
+    into a ``BypassedTopKOutput`` here. The layer is looked up by ``layer_id``
+    in the current forward context.
+
+    Args:
+        hidden_states: Input activations for the MoE layer.
+        router_logits: Router logits stored on the rebuilt ``BypassedTopKOutput``.
+        top_k: ``top_k`` field of the rebuilt ``TopKConfig``.
+        topk_group: ``topk_group`` field of the rebuilt ``TopKConfig``.
+        num_expert_group: ``num_expert_group`` field of the rebuilt ``TopKConfig``.
+        correction_bias: ``correction_bias`` field of the rebuilt ``TopKConfig``.
+        layer_id: Key into ``forward_context.moe_layers`` selecting the layer to run.
+
+    Returns:
+        The result of the selected layer's ``forward_impl``.
+    """
+    # Rebuild the routing object from the individual arguments.
+    topk_output = BypassedTopKOutput(
+        hidden_states=hidden_states,
+        router_logits=router_logits,
+        topk_config=TopKConfig(
+            top_k=top_k,
+            topk_group=topk_group,
+            num_expert_group=num_expert_group,
+            correction_bias=correction_bias,
+        ),
+    )
+    # The layer object is not an argument; it is fetched from the forward context.
+    forward_context = get_forward_context()
+    moe_layer = forward_context.moe_layers[layer_id]
+    return moe_layer.forward_impl(hidden_states, topk_output)
+
+
+def flashinfer_fp4_moe_forward_piecewise_cuda_graph_impl_fake(
+    hidden_states: torch.Tensor,
+    router_logits: torch.Tensor,
+    top_k: int,
+    topk_group: Optional[int],
+    num_expert_group: Optional[int],
+    correction_bias: Optional[torch.Tensor],
+    layer_id: int,
+) -> torch.Tensor:
+    """Fake counterpart of ``flashinfer_fp4_moe_forward_piecewise_cuda_graph_impl``.
+
+    Takes the same arguments as the real implementation but uses only
+    ``hidden_states``; no MoE layer is run.
+
+    Returns:
+        An uninitialized tensor created with ``torch.empty_like(hidden_states)``.
+    """
+    # NOTE(review): presumably only the output's shape/dtype matter here — confirm.
+    return torch.empty_like(hidden_states)
+
+
 direct_register_custom_op(
     op_name="moe_forward_piecewise_cuda_graph_impl",
     op_func=moe_forward_piecewise_cuda_graph_impl,
     mutates_args=[],
     fake_impl=moe_forward_piecewise_cuda_graph_impl_fake,
 )
+
+# Register the flashinfer fp4 variant the same way as the op above: the real
+# function as op_func, its fake counterpart as fake_impl, and no mutated arguments.
+# The forward method of the fp4 MoE layer calls it as
+# torch.ops.sglang.flashinfer_fp4_moe_forward_piecewise_cuda_graph_impl.
+direct_register_custom_op(
+    op_name="flashinfer_fp4_moe_forward_piecewise_cuda_graph_impl",
+    op_func=flashinfer_fp4_moe_forward_piecewise_cuda_graph_impl,
+    mutates_args=[],
+    fake_impl=flashinfer_fp4_moe_forward_piecewise_cuda_graph_impl_fake,
+)
diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py
@@ -17,7 +17,6 @@
 )
 from sglang.srt.layers.quantization.modelopt_quant import (
     FLASHINFER_FP4_GEMM_BACKEND,
-    _sglang_fp4_gemm,
     enable_flashinfer_fp4_gemm,
     fp4_quantize,
 )
@@ -154,7 +153,7 @@ def apply_weights(
             w = layer.weight_packed.T
             w_blockscale = layer.weight_scale.T
 
-        out = _sglang_fp4_gemm(
+        out = torch.ops.sglang.fp4_gemm(
             x_fp4,
             w,
             x_blockscale,
diff --git a/python/sglang/srt/layers/quantization/modelopt_quant.py b/python/sglang/srt/layers/quantization/modelopt_quant.py
@@ -1229,7 +1229,7 @@ def apply(
         backend = (
             FLASHINFER_FP4_GEMM_BACKEND if FLASHINFER_FP4_GEMM_BACKEND else "cutlass"
         )
-        out = _sglang_fp4_gemm(
+        out = torch.ops.sglang.fp4_gemm(
             x_fp4,
             w,
             x_scale_interleaved,
diff --git a/python/sglang/srt/models/deepseek_v2.py b/python/sglang/srt/models/deepseek_v2.py
@@ -20,6 +20,7 @@
 import concurrent.futures
 import logging
 import os
+from contextlib import nullcontext
 from enum import IntEnum, auto
 from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
 
@@ -400,6 +401,9 @@ def handle_attention_fa4(attn, forward_batch):
 
 
 def handle_attention_trtllm_mla(attn, forward_batch):
+    if is_in_piecewise_cuda_graph():
+        return AttnForwardMethod.MLA
+
     sum_extend_prefix_lens = _get_sum_extend_prefix_lens(forward_batch)
     if forward_batch.forward_mode.is_extend_without_speculative() and (
         not attn.disable_chunked_prefix_cache or sum_extend_prefix_lens == 0
@@ -3188,7 +3192,13 @@ def forward(
                 normal_end_layer = normal_start_layer = 0
         aux_hidden_states = []
         for i in range(normal_start_layer, normal_end_layer):
-            with get_global_expert_distribution_recorder().with_current_layer(i):
+            # NOTE: torch dynamo does not support graph breaks inside a context manager
+            ctx = (
+                nullcontext()
+                if get_global_server_args().enable_piecewise_cuda_graph
+                else get_global_expert_distribution_recorder().with_current_layer(i)
+            )
+            with ctx:
                 if i in self.layers_to_capture:
                     aux_hidden_states.append(hidden_states + residual)
                 layer = self.layers[i]
diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
@@ -163,7 +163,7 @@
         TestFile("test_disaggregation_dp_attention.py", 155),
     ],
     "per-commit-4-gpu-b200-stage-b": [
-        TestFile("test_deepseek_v3_fp4_4gpu.py", 1800),  # Stage B test
+        TestFile("test_deepseek_v3_fp4_4gpu.py", 2000),  # Stage B test
     ],
     "per-commit-4-gpu-b200": [
         TestFile("test_flash_attention_4.py", 90),
diff --git a/test/srt/test_deepseek_v3_fp4_4gpu.py b/test/srt/test_deepseek_v3_fp4_4gpu.py
@@ -176,5 +176,72 @@ def test_bs_1_speed(self):
         self.assertGreater(speed, 150)
 
 
+class TestDeepseekV3FP4PiecewiseCudaGraph(CustomTestCase):
+    """Accuracy and speed checks for DeepSeek-V3 FP4 with piecewise cuda graph enabled.
+
+    A single server is launched for the whole class and shared by both tests.
+    """
+
+    @classmethod
+    def setUpClass(cls):
+        """Launch the server with ``--enable-piecewise-cuda-graph`` on 4-way TP."""
+        cls.model = FULL_DEEPSEEK_V3_FP4_MODEL_PATH
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        # trtllm_mla attention + flashinfer_trtllm MoE runner + modelopt_fp4
+        # quantization, with an fp8_e4m3 KV cache and multithreaded weight loading.
+        other_args = [
+            "--tp",
+            "4",
+            "--attention-backend",
+            "trtllm_mla",
+            "--moe-runner-backend",
+            "flashinfer_trtllm",
+            "--quantization",
+            "modelopt_fp4",
+            "--enable-piecewise-cuda-graph",
+            "--kv-cache-dtype",
+            "fp8_e4m3",
+            "--model-loader-extra-config",
+            '{"enable_multithread_load": true,"num_threads": 64}',
+        ]
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=SERVER_LAUNCH_TIMEOUT,
+            other_args=other_args,
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        """Kill the server process tree started in ``setUpClass``."""
+        kill_process_tree(cls.process.pid)
+
+    def test_a_gsm8k(
+        self,
+    ):
+        """Run 8-shot GSM8K against the server and require accuracy above 0.935."""
+        # 1319 questions, all issued in parallel (parallel == num_questions).
+        args = SimpleNamespace(
+            num_shots=8,
+            data_path=None,
+            num_questions=1319,
+            parallel=1319,
+            max_new_tokens=512,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+        metrics = run_eval_few_shot_gsm8k(args)
+        print(f"{metrics=}")
+
+        # In CI, also publish the accuracy to the GitHub step summary.
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_gsm8k (deepseek-v3-fp4)\n" f'{metrics["accuracy"]=:.3f}\n'
+            )
+
+        self.assertGreater(metrics["accuracy"], 0.935)
+
+    def test_bs_1_speed(self):
+        """Send a single prompt and require a speed above 120 (reported as token/s)."""
+        args = BenchArgs(port=int(self.base_url.split(":")[-1]), max_new_tokens=2048)
+        _, speed = send_one_prompt(args)
+
+        print(f"{speed=:.2f}")
+
+        # In CI, also publish the measured speed to the GitHub step summary.
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_bs_1_speed (deepseek-v3-fp4)\n" f"{speed=:.2f} token/s\n"
+            )
+
+        self.assertGreater(speed, 120)
+
+
 if __name__ == "__main__":
     unittest.main()

Original file line number	Diff line number	Diff line change
`@@ -1229,7 +1229,7 @@ def apply(`
`1229`	`1229`	`backend = (`
`1230`	`1230`	`FLASHINFER_FP4_GEMM_BACKEND if FLASHINFER_FP4_GEMM_BACKEND else "cutlass"`
`1231`	`1231`	`)`
`1232`		`- out = _sglang_fp4_gemm(`
	`1232`	`+ out = torch.ops.sglang.fp4_gemm(`
`1233`	`1233`	`x_fp4,`
`1234`	`1234`	`w,`
`1235`	`1235`	`x_scale_interleaved,`