sgl-project · ispobock · Dec 10, 2025 · Dec 10, 2025 · Dec 10, 2025 · Dec 10, 2025
@@ -7,7 +7,7 @@
 _is_cuda = is_cuda()
 
 if _is_cuda:
-    from sgl_kernel import silu_and_mul
+    from sgl_kernel import moe_sum_reduce, silu_and_mul
 
 
 def get_scalar_type(num_bits: int, has_zp: bool):
@@ -204,9 +204,15 @@ def fused_marlin_moe(
     ).view(-1, topk, K)
 
     output = hidden_states if inplace else torch.empty_like(hidden_states)
-    torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1, out=output)
-    if routed_scaling_factor is not None:
-        output *= routed_scaling_factor
+
+    if routed_scaling_factor is None:
+        routed_scaling_factor = 1.0
+
+    moe_sum_reduce(
+        intermediate_cache3,
+        output,
+        routed_scaling_factor,
+    )
     return output