diff --git a/python/sglang/srt/model_executor/cuda_graph_runner.py b/python/sglang/srt/model_executor/cuda_graph_runner.py
index 8d3f63e15676..5aba9084052c 100644
--- a/python/sglang/srt/model_executor/cuda_graph_runner.py
+++ b/python/sglang/srt/model_executor/cuda_graph_runner.py
@@ -188,6 +188,11 @@ def __init__(self, model_runner: ModelRunner):
         self.speculative_algorithm = model_runner.server_args.speculative_algorithm
         self.tp_size = model_runner.server_args.tp_size
         self.dp_size = model_runner.server_args.dp_size
+        self.enable_deepep_moe = model_runner.server_args.enable_deepep_moe
+        self.moe_dense_fully_dp = (
+            model_runner.server_args.enable_deepep_moe
+            and model_runner.server_args.moe_dense_tp_size == 1
+        )
 
         # Batch sizes to capture
         self.capture_bs, self.compile_bs = get_batch_sizes_to_capture(model_runner)
@@ -301,7 +306,9 @@ def model_capture_mode(self):
 
     def can_run(self, forward_batch: ForwardBatch):
         if self.enable_dp_attention or self.enable_sp_layernorm:
-            total_global_tokens = sum(forward_batch.global_num_tokens_cpu)
+            reducer = max if self.moe_dense_fully_dp else sum
+            # DeepEP MoE layers use a fixed shape with masking instead of gathering tokens from DP ranks.
+            total_global_tokens = reducer(forward_batch.global_num_tokens_cpu)
 
             is_bs_supported = forward_batch.can_run_dp_cuda_graph and (
                 total_global_tokens in self.graphs
@@ -484,8 +491,10 @@ def replay_prepare(self, forward_batch: ForwardBatch):
 
         # Pad
         if self.enable_dp_attention or self.enable_sp_layernorm:
+            reducer = max if self.moe_dense_fully_dp else sum
+            # DeepEP MoE layers use a fixed shape with masking instead of gathering tokens from DP ranks.
             index = bisect.bisect_left(
-                self.capture_bs, sum(forward_batch.global_num_tokens_cpu)
+                self.capture_bs, reducer(forward_batch.global_num_tokens_cpu)
             )
         else:
             index = bisect.bisect_left(self.capture_bs, raw_bs)