vllm_omni/model_executor/models/qwen3_omni/qwen3_omni.py (3 additions, 3 deletions)
@@ -342,9 +342,9 @@ def forward(
                 and thinker_tts_embeds.shape[1] == 3
             ):
                 bos_eos_pad = thinker_tts_embeds.to(text_hidden_states.device).chunk(3, dim=1)  # 3 * [1,1,H]
-                multimodal_outputs["tts_bos_embed"] = bos_eos_pad[0]
-                multimodal_outputs["tts_eos_embed"] = bos_eos_pad[1]
-                multimodal_outputs["tts_pad_embed"] = bos_eos_pad[2]
+                multimodal_outputs["tts_bos_embed"] = [bos_eos_pad[0]]
+                multimodal_outputs["tts_eos_embed"] = [bos_eos_pad[1]]
+                multimodal_outputs["tts_pad_embed"] = [bos_eos_pad[2]]
         except Exception:
             # Best-effort; absence will be handled by talker with fallbacks
             pass
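Why the embeds are now wrapped in single-element lists: the list branch in gpu_ar_model_runner.py (second file below) pops one element per step, detaches it, and moves it to CPU. A minimal runnable sketch of that consumption path; the helper name pop_list_payload is hypothetical and the random tensor stands in for the real [1, 1, H] embed (assumes torch is installed):

import torch

def pop_list_payload(multimodal_outputs: dict, mm_payload: dict, k: str) -> None:
    # Hypothetical helper mirroring the list branch in gpu_ar_model_runner.py.
    v = multimodal_outputs[k]
    element = v[0]
    if isinstance(element, torch.Tensor):
        # Detach and move to CPU so the payload can leave the GPU worker.
        element = element.detach().to("cpu").contiguous()
    # Drop the consumed element; if only one is left, keep re-using it.
    multimodal_outputs[k] = v[1:] if len(v) > 1 else v
    mm_payload[k] = element

outs = {"tts_bos_embed": [torch.randn(1, 1, 8)]}  # [1, 1, H] as in the chunk above
payload: dict = {}
pop_list_payload(outs, payload, "tts_bos_embed")
print(payload["tts_bos_embed"].shape)  # torch.Size([1, 1, 8]), now a CPU tensor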
vllm_omni/worker/gpu_ar_model_runner.py (8 additions, 1 deletion)
@@ -291,6 +291,11 @@ def propose_draft_token_ids(sampled_token_ids):
                 # Case 1: tensor aligned on token dimension
                 if isinstance(v, torch.Tensor) and v.shape[0] == hidden_states_cpu.shape[0]:
                     mm_payload[k] = v.detach().to("cpu")[prev_logits_index : logits_index + 1].contiguous()
+                elif isinstance(v, torch.Tensor) and v.shape[0] != hidden_states_cpu.shape[0]:
+                    logger.error(
+                        "Error merging multimodal outputs: tensor dimension mismatch, "
+                        f"{v.shape} != {hidden_states_cpu.shape} for {k}"
+                    )
                 # Case 2: nested dict of tensors aligned on token dimension (e.g., selected_hidden_layers)
                 elif isinstance(v, dict):
                     sub_dict: dict[str, torch.Tensor] = {}
@@ -302,7 +307,9 @@ def propose_draft_token_ids(sampled_token_ids):
                     if sub_dict:
                         mm_payload[k] = sub_dict
                 elif isinstance(v, list):
-                    element: torch.Tensor = v[0]
+                    element = v[0]
+                    if isinstance(element, torch.Tensor):
+                        element = element.detach().to("cpu").contiguous()
                     multimodal_outputs[k] = v[1:] if len(v) > 1 else v
                     mm_payload[k] = element
         except Exception as e:
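The intent of the new Case 1 guard: tensors aligned on the token dimension are sliced into the per-step payload, while mismatched tensors are reported instead of silently dropped. A minimal standalone sketch of that branch; the logger setup, slice indices, and tensor shapes are illustrative and not the runner's real state:

import logging

import torch

logger = logging.getLogger("gpu_ar_model_runner")
logging.basicConfig(level=logging.ERROR)

hidden_states_cpu = torch.zeros(4, 8)  # 4 tokens in this step (illustrative)
prev_logits_index, logits_index = 1, 2
mm_payload: dict = {}

for k, v in {"aligned": torch.randn(4, 8), "misaligned": torch.randn(7, 8)}.items():
    if isinstance(v, torch.Tensor) and v.shape[0] == hidden_states_cpu.shape[0]:
        # Aligned on the token dimension: slice the step's rows into the payload.
        mm_payload[k] = v.detach().to("cpu")[prev_logits_index : logits_index + 1].contiguous()
    elif isinstance(v, torch.Tensor) and v.shape[0] != hidden_states_cpu.shape[0]:
        # Mismatched tensors are only logged, as in the new branch above.
        logger.error(
            "Error merging multimodal outputs: tensor dimension mismatch, "
            f"{v.shape} != {hidden_states_cpu.shape} for {k}"
        )

print(sorted(mm_payload))  # ['aligned']; 'misaligned' is reported but not merged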