From f70400cba0cb60e817745a2b670e5526238a5dd7 Mon Sep 17 00:00:00 2001
From: BBuf <1182563586@qq.com>
Date: Mon, 28 Apr 2025 23:03:57 +0800
Subject: [PATCH 1/5] fused moe triton tuning script support qwen3

---
 benchmark/kernels/fused_moe_triton/README.md  |  7 +++++
 .../benchmark_torch_compile_fused_moe.py      | 31 +++++++++++++++++--
 ...nchmark_vllm_vs_sglang_fused_moe_triton.py | 12 ++++++-
 .../tuning_fused_moe_triton.py                |  5 +++
 4 files changed, 52 insertions(+), 3 deletions(-)

diff --git a/benchmark/kernels/fused_moe_triton/README.md b/benchmark/kernels/fused_moe_triton/README.md
index 83de52e417d0..85d04279053b 100644
--- a/benchmark/kernels/fused_moe_triton/README.md
+++ b/benchmark/kernels/fused_moe_triton/README.md
@@ -20,6 +20,13 @@ python benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py \
     --dtype fp8_w8a8 \
     --tune
 
+# Tune Qwen3-235B-A22B-FP8 and TP=4
+python benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py \
+    --model Qwen/Qwen3-235B-A22B-FP8 \
+    --tp-size 4 \
+    --dtype fp8_w8a8 \
+    --tune
+
 # Tune DeepSeek-V3 with FP8, TP=8 and n_share_experts_fusion=8
 python benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py \
     --model deepseek-ai/DeepSeek-V3-0324 \
diff --git a/benchmark/kernels/fused_moe_triton/benchmark_torch_compile_fused_moe.py b/benchmark/kernels/fused_moe_triton/benchmark_torch_compile_fused_moe.py
index 8206684e80fc..8fe828d9841c 100644
--- a/benchmark/kernels/fused_moe_triton/benchmark_torch_compile_fused_moe.py
+++ b/benchmark/kernels/fused_moe_triton/benchmark_torch_compile_fused_moe.py
@@ -30,10 +30,20 @@ def get_model_config(model_name: str, tp_size: int):
         topk = config.num_experts_per_tok
         intermediate_size = config.moe_intermediate_size
         shard_intermediate_size = 2 * intermediate_size // tp_size
+    elif config.architectures[0] == "Qwen3MoeForCausalLM":
+        E = config.num_experts
+        topk = config.num_experts_per_tok
+        intermediate_size = config.moe_intermediate_size
+        shard_intermediate_size = 2 * intermediate_size // tp_size
     elif config.architectures[0] in ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"]:
-        E = config.n_routed_experts
+        n_share_fusion_experts = args.n_share_experts_fusion
+        E = (
+            config.n_routed_experts + n_share_fusion_experts
+            if config.architectures[0] in ["DeepseekV3ForCausalLM"]
+            else config.n_routed_experts
+        )
         topk = config.num_experts_per_tok
-        intermediate_size = config.intermediate_size
+        intermediate_size = config.moe_intermediate_size
         shard_intermediate_size = 2 * intermediate_size // tp_size
     elif config.architectures[0] in [
         "Grok1ForCausalLM",
@@ -51,12 +61,29 @@ def get_model_config(model_name: str, tp_size: int):
         intermediate_size = config.intermediate_size
         shard_intermediate_size = 2 * intermediate_size // tp_size
 
+    vllm_version_num = (
+        vllm.__version_tuple__[0] * 100
+        + vllm.__version_tuple__[1] * 10
+        + vllm.__version_tuple__[2]
+    )
+    block_shape = None
+    if (
+        hasattr(config, "quantization_config")
+        and "weight_block_size" in config.quantization_config
+    ):
+        block_shape = config.quantization_config["weight_block_size"]
+        assert len(block_shape) == 2
+        assert (
+            vllm_version_num >= 66
+        ), "Block-wise quantized fp8 fused_moe is only supported for VLLM>=0.6.6.post1"
+
     shape_configs = {
         "num_experts": E,
         "topk": topk,
         "hidden_size": config.hidden_size,
         "shard_intermediate_size": shard_intermediate_size,
         "dtype": config.torch_dtype,
+        "block_shape": block_shape,
     }
     print(f"{shape_configs=}")
     return shape_configs
diff --git a/benchmark/kernels/fused_moe_triton/benchmark_vllm_vs_sglang_fused_moe_triton.py b/benchmark/kernels/fused_moe_triton/benchmark_vllm_vs_sglang_fused_moe_triton.py
index 0760116fb859..e68c01bd5536 100644
--- a/benchmark/kernels/fused_moe_triton/benchmark_vllm_vs_sglang_fused_moe_triton.py
+++ b/benchmark/kernels/fused_moe_triton/benchmark_vllm_vs_sglang_fused_moe_triton.py
@@ -30,8 +30,18 @@ def get_model_config(model_name: str, tp_size: int):
         topk = config.num_experts_per_tok
         intermediate_size = config.moe_intermediate_size
         shard_intermediate_size = 2 * intermediate_size // tp_size
+    elif config.architectures[0] == "Qwen3MoeForCausalLM":
+        E = config.num_experts
+        topk = config.num_experts_per_tok
+        intermediate_size = config.moe_intermediate_size
+        shard_intermediate_size = 2 * intermediate_size // tp_size
     elif config.architectures[0] in ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"]:
-        E = config.n_routed_experts
+        n_share_fusion_experts = args.n_share_experts_fusion
+        E = (
+            config.n_routed_experts + n_share_fusion_experts
+            if config.architectures[0] in ["DeepseekV3ForCausalLM"]
+            else config.n_routed_experts
+        )
         topk = config.num_experts_per_tok
         intermediate_size = config.moe_intermediate_size
         shard_intermediate_size = 2 * intermediate_size // tp_size
diff --git a/benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py b/benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py
index a84a4a6e2300..342f4fa1aa8b 100644
--- a/benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py
+++ b/benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py
@@ -398,6 +398,11 @@ def main(args: argparse.Namespace):
         topk = config.num_experts_per_tok
         intermediate_size = config.moe_intermediate_size
         shard_intermediate_size = 2 * intermediate_size // args.tp_size
+    elif config.architectures[0] == "Qwen3MoeForCausalLM":
+        E = config.num_experts
+        topk = config.num_experts_per_tok
+        intermediate_size = config.moe_intermediate_size
+        shard_intermediate_size = 2 * intermediate_size // args.tp_size
     elif config.architectures[0] in ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"]:
         n_share_fusion_experts = args.n_share_experts_fusion
         E = (

From 4d1d63b213e04709c460a41a2d90e7ed7121c92a Mon Sep 17 00:00:00 2001
From: BBuf <1182563586@qq.com>
Date: Mon, 28 Apr 2025 23:06:35 +0800
Subject: [PATCH 2/5] ipd

---
 .../fused_moe_triton/benchmark_torch_compile_fused_moe.py | 7 +------
 .../benchmark_vllm_vs_sglang_fused_moe_triton.py          | 7 +------
 2 files changed, 2 insertions(+), 12 deletions(-)

diff --git a/benchmark/kernels/fused_moe_triton/benchmark_torch_compile_fused_moe.py b/benchmark/kernels/fused_moe_triton/benchmark_torch_compile_fused_moe.py
index 8fe828d9841c..1e171df1cfab 100644
--- a/benchmark/kernels/fused_moe_triton/benchmark_torch_compile_fused_moe.py
+++ b/benchmark/kernels/fused_moe_triton/benchmark_torch_compile_fused_moe.py
@@ -36,12 +36,7 @@ def get_model_config(model_name: str, tp_size: int):
         intermediate_size = config.moe_intermediate_size
         shard_intermediate_size = 2 * intermediate_size // tp_size
     elif config.architectures[0] in ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"]:
-        n_share_fusion_experts = args.n_share_experts_fusion
-        E = (
-            config.n_routed_experts + n_share_fusion_experts
-            if config.architectures[0] in ["DeepseekV3ForCausalLM"]
-            else config.n_routed_experts
-        )
+        E = config.n_routed_experts
         topk = config.num_experts_per_tok
         intermediate_size = config.moe_intermediate_size
         shard_intermediate_size = 2 * intermediate_size // tp_size
diff --git a/benchmark/kernels/fused_moe_triton/benchmark_vllm_vs_sglang_fused_moe_triton.py b/benchmark/kernels/fused_moe_triton/benchmark_vllm_vs_sglang_fused_moe_triton.py
index e68c01bd5536..111d6c3e8e9c 100644
--- a/benchmark/kernels/fused_moe_triton/benchmark_vllm_vs_sglang_fused_moe_triton.py
+++ b/benchmark/kernels/fused_moe_triton/benchmark_vllm_vs_sglang_fused_moe_triton.py
@@ -36,12 +36,7 @@ def get_model_config(model_name: str, tp_size: int):
         intermediate_size = config.moe_intermediate_size
         shard_intermediate_size = 2 * intermediate_size // tp_size
     elif config.architectures[0] in ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"]:
-        n_share_fusion_experts = args.n_share_experts_fusion
-        E = (
-            config.n_routed_experts + n_share_fusion_experts
-            if config.architectures[0] in ["DeepseekV3ForCausalLM"]
-            else config.n_routed_experts
-        )
+        E = config.n_routed_experts
         topk = config.num_experts_per_tok
         intermediate_size = config.moe_intermediate_size
         shard_intermediate_size = 2 * intermediate_size // tp_size

From deae116ca4416e4a7178787691aef8843dec08c7 Mon Sep 17 00:00:00 2001
From: BBuf <1182563586@qq.com>
Date: Mon, 28 Apr 2025 23:08:53 +0800
Subject: [PATCH 3/5] ipd

---
 .../benchmark_torch_compile_fused_moe.py | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/benchmark/kernels/fused_moe_triton/benchmark_torch_compile_fused_moe.py b/benchmark/kernels/fused_moe_triton/benchmark_torch_compile_fused_moe.py
index 1e171df1cfab..71815903eb45 100644
--- a/benchmark/kernels/fused_moe_triton/benchmark_torch_compile_fused_moe.py
+++ b/benchmark/kernels/fused_moe_triton/benchmark_torch_compile_fused_moe.py
@@ -56,22 +56,6 @@ def get_model_config(model_name: str, tp_size: int):
         intermediate_size = config.intermediate_size
         shard_intermediate_size = 2 * intermediate_size // tp_size
 
-    vllm_version_num = (
-        vllm.__version_tuple__[0] * 100
-        + vllm.__version_tuple__[1] * 10
-        + vllm.__version_tuple__[2]
-    )
-    block_shape = None
-    if (
-        hasattr(config, "quantization_config")
-        and "weight_block_size" in config.quantization_config
-    ):
-        block_shape = config.quantization_config["weight_block_size"]
-        assert len(block_shape) == 2
-        assert (
-            vllm_version_num >= 66
-        ), "Block-wise quantized fp8 fused_moe is only supported for VLLM>=0.6.6.post1"
-
     shape_configs = {
         "num_experts": E,
         "topk": topk,

From 9665cd60456ed199a951515d8f44088f8bcd92e2 Mon Sep 17 00:00:00 2001
From: BBuf <1182563586@qq.com>
Date: Mon, 28 Apr 2025 23:09:35 +0800
Subject: [PATCH 4/5] upd

---
 .../fused_moe_triton/benchmark_torch_compile_fused_moe.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/benchmark/kernels/fused_moe_triton/benchmark_torch_compile_fused_moe.py b/benchmark/kernels/fused_moe_triton/benchmark_torch_compile_fused_moe.py
index 71815903eb45..662d6ec27212 100644
--- a/benchmark/kernels/fused_moe_triton/benchmark_torch_compile_fused_moe.py
+++ b/benchmark/kernels/fused_moe_triton/benchmark_torch_compile_fused_moe.py
@@ -62,7 +62,6 @@ def get_model_config(model_name: str, tp_size: int):
         "hidden_size": config.hidden_size,
         "shard_intermediate_size": shard_intermediate_size,
         "dtype": config.torch_dtype,
-        "block_shape": block_shape,
     }
     print(f"{shape_configs=}")
     return shape_configs

From a0a91b5e617960deb3915638f48f64be669082da Mon Sep 17 00:00:00 2001
From: BBuf <1182563586@qq.com>
Date: Mon, 28 Apr 2025 23:12:40 +0800
Subject: [PATCH 5/5] upd

---
 benchmark/kernels/fused_moe_triton/README.md              | 2 +-
 .../fused_moe_triton/benchmark_torch_compile_fused_moe.py | 8 ++++----
 .../benchmark_vllm_vs_sglang_fused_moe_triton.py          | 8 ++++----
 .../kernels/fused_moe_triton/tuning_fused_moe_triton.py   | 8 ++++----
 4 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/benchmark/kernels/fused_moe_triton/README.md b/benchmark/kernels/fused_moe_triton/README.md
index 85d04279053b..cabc449d244f 100644
--- a/benchmark/kernels/fused_moe_triton/README.md
+++ b/benchmark/kernels/fused_moe_triton/README.md
@@ -23,7 +23,7 @@ python benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py \
 # Tune Qwen3-235B-A22B-FP8 and TP=4
 python benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py \
     --model Qwen/Qwen3-235B-A22B-FP8 \
-    --tp-size 4 \
+    --tp-size 4 \
     --dtype fp8_w8a8 \
     --tune
 
diff --git a/benchmark/kernels/fused_moe_triton/benchmark_torch_compile_fused_moe.py b/benchmark/kernels/fused_moe_triton/benchmark_torch_compile_fused_moe.py
index 662d6ec27212..13c83726c7dd 100644
--- a/benchmark/kernels/fused_moe_triton/benchmark_torch_compile_fused_moe.py
+++ b/benchmark/kernels/fused_moe_triton/benchmark_torch_compile_fused_moe.py
@@ -31,10 +31,10 @@ def get_model_config(model_name: str, tp_size: int):
         intermediate_size = config.moe_intermediate_size
         shard_intermediate_size = 2 * intermediate_size // tp_size
     elif config.architectures[0] == "Qwen3MoeForCausalLM":
-        E = config.num_experts
-        topk = config.num_experts_per_tok
-        intermediate_size = config.moe_intermediate_size
-        shard_intermediate_size = 2 * intermediate_size // tp_size
+        E = config.num_experts
+        topk = config.num_experts_per_tok
+        intermediate_size = config.moe_intermediate_size
+        shard_intermediate_size = 2 * intermediate_size // tp_size
     elif config.architectures[0] in ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"]:
         E = config.n_routed_experts
         topk = config.num_experts_per_tok
diff --git a/benchmark/kernels/fused_moe_triton/benchmark_vllm_vs_sglang_fused_moe_triton.py b/benchmark/kernels/fused_moe_triton/benchmark_vllm_vs_sglang_fused_moe_triton.py
index 111d6c3e8e9c..85517e4e7c8d 100644
--- a/benchmark/kernels/fused_moe_triton/benchmark_vllm_vs_sglang_fused_moe_triton.py
+++ b/benchmark/kernels/fused_moe_triton/benchmark_vllm_vs_sglang_fused_moe_triton.py
@@ -31,10 +31,10 @@ def get_model_config(model_name: str, tp_size: int):
         intermediate_size = config.moe_intermediate_size
         shard_intermediate_size = 2 * intermediate_size // tp_size
     elif config.architectures[0] == "Qwen3MoeForCausalLM":
-        E = config.num_experts
-        topk = config.num_experts_per_tok
-        intermediate_size = config.moe_intermediate_size
-        shard_intermediate_size = 2 * intermediate_size // tp_size
+        E = config.num_experts
+        topk = config.num_experts_per_tok
+        intermediate_size = config.moe_intermediate_size
+        shard_intermediate_size = 2 * intermediate_size // tp_size
     elif config.architectures[0] in ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"]:
         E = config.n_routed_experts
         topk = config.num_experts_per_tok
diff --git a/benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py b/benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py
index 342f4fa1aa8b..7c03a8358697 100644
--- a/benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py
+++ b/benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py
@@ -399,10 +399,10 @@ def main(args: argparse.Namespace):
         intermediate_size = config.moe_intermediate_size
         shard_intermediate_size = 2 * intermediate_size // args.tp_size
     elif config.architectures[0] == "Qwen3MoeForCausalLM":
-        E = config.num_experts
-        topk = config.num_experts_per_tok
-        intermediate_size = config.moe_intermediate_size
-        shard_intermediate_size = 2 * intermediate_size // args.tp_size
+        E = config.num_experts
+        topk = config.num_experts_per_tok
+        intermediate_size = config.moe_intermediate_size
+        shard_intermediate_size = 2 * intermediate_size // args.tp_size
     elif config.architectures[0] in ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"]:
         n_share_fusion_experts = args.n_share_experts_fusion
        E = (
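
Note (illustration only, not part of the patch series): the "Qwen3MoeForCausalLM" branch added in PATCH 1/5 derives the MoE tuning shapes directly from the model's Hugging Face config. Below is a minimal Python sketch of that derivation. The concrete numbers are assumptions based on Qwen3-235B-A22B's published config (128 experts, 8 experts per token, moe_intermediate_size 1536, hidden_size 4096) and the README's --tp-size 4 example; the model's config.json is the authoritative source.

from types import SimpleNamespace

# Assumed (illustrative) Qwen3-235B-A22B config values; the real scripts read
# these fields from the Hugging Face config of the --model argument.
config = SimpleNamespace(
    architectures=["Qwen3MoeForCausalLM"],
    num_experts=128,
    num_experts_per_tok=8,
    moe_intermediate_size=1536,
    hidden_size=4096,
)
tp_size = 4  # mirrors the --tp-size 4 README example

# Same arithmetic as the new elif branch in the three scripts.
E = config.num_experts                    # number of routed experts
topk = config.num_experts_per_tok         # experts activated per token
intermediate_size = config.moe_intermediate_size
# The factor 2 covers the fused gate and up projections; dividing by tp_size
# gives the per-rank shard whose GEMM shapes the Triton kernel is tuned for.
shard_intermediate_size = 2 * intermediate_size // tp_size

print(E, topk, config.hidden_size, shard_intermediate_size)  # 128 8 4096 768

Under these assumptions, a TP=4 tuning run searches kernel configurations for 128 experts with a 768-wide per-rank gate/up shard.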