sgl-project · zhyncs · Jun 26, 2025 · Jun 26, 2025 · gemini-code-assist · Jun 26, 2025
@@ -479,10 +479,6 @@ def __init__(self, *args, **kwargs):
     def on_select_experts(self, layer_idx: int, topk_ids: torch.Tensor):
         topk_ids = topk_ids.flatten()
         mask = topk_ids != -1
-        assert self._data[layer_idx, :].shape == topk_ids.shape, (
-            "Shape mismatch between data and topk_ids."
-            "Selecting expert is not supported for multiple token prediction at the moment."
-        )
         self._data[layer_idx, :].scatter_add_(
             dim=0, index=topk_ids.masked_fill(~mask, 0).long(), src=mask.int()
         )
-        self._data[layer_idx, :].scatter_add_(
-            dim=0, index=topk_ids.masked_fill(~mask, 0).long(), src=mask.int()
-        )
+        mask = topk_ids != -1
+        if topk_ids.numel() > 0:
+            assert topk_ids.max() < self._data.shape[1]
-        self._data[layer_idx, :].scatter_add_(
-            dim=0, index=topk_ids.masked_fill(~mask, 0).long(), src=mask.int()
-        )
+        mask = topk_ids != -1
+        if topk_ids.numel() > 0:
+            assert topk_ids.max() < self._data.shape[1]