Fix LTX-2 Inference when num_videos_per_prompt > 1 and CFG is Enabled (#13121)

dg845 · web-flow · commit 985d83c948ab · 2026-02-11T22:35:29.000-08:00
Fix LTX-2 inference when num_videos_per_prompt > 1 and CFG is enabled
diff --git a/src/diffusers/models/transformers/transformer_ltx2.py b/src/diffusers/models/transformers/transformer_ltx2.py
@@ -56,10 +56,8 @@ def apply_split_rotary_emb(x: torch.Tensor, freqs: Tuple[torch.Tensor, torch.Ten
     x_dtype = x.dtype
     needs_reshape = False
     if x.ndim != 4 and cos.ndim == 4:
-        # cos is (#b, h, t, r) -> reshape x to (b, h, t, dim_per_head)
-        # The cos/sin batch dim may only be broadcastable, so take batch size from x
-        b = x.shape[0]
-        _, h, t, _ = cos.shape
+        # cos is (b, h, t, r) -> reshape x to (b, h, t, dim_per_head)
+        b, h, t, _ = cos.shape
         x = x.reshape(b, t, h, -1).swapaxes(1, 2)
         needs_reshape = True
 
diff --git a/src/diffusers/pipelines/ltx2/pipeline_ltx2.py b/src/diffusers/pipelines/ltx2/pipeline_ltx2.py
@@ -1081,6 +1081,10 @@ def __call__(
         audio_coords = self.transformer.audio_rope.prepare_audio_coords(
             audio_latents.shape[0], audio_num_frames, audio_latents.device
         )
+        # Duplicate the positional ids as well if using CFG
+        if self.do_classifier_free_guidance:
+            video_coords = video_coords.repeat((2,) + (1,) * (video_coords.ndim - 1))  # Repeat twice in batch dim
+            audio_coords = audio_coords.repeat((2,) + (1,) * (audio_coords.ndim - 1))
 
         # 7. Denoising loop
         with self.progress_bar(total=num_inference_steps) as progress_bar:
diff --git a/src/diffusers/pipelines/ltx2/pipeline_ltx2_image2video.py b/src/diffusers/pipelines/ltx2/pipeline_ltx2_image2video.py
@@ -1139,6 +1139,10 @@ def __call__(
         audio_coords = self.transformer.audio_rope.prepare_audio_coords(
             audio_latents.shape[0], audio_num_frames, audio_latents.device
         )
+        # Duplicate the positional ids as well if using CFG
+        if self.do_classifier_free_guidance:
+            video_coords = video_coords.repeat((2,) + (1,) * (video_coords.ndim - 1))  # Repeat twice in batch dim
+            audio_coords = audio_coords.repeat((2,) + (1,) * (audio_coords.ndim - 1))
 
         # 7. Denoising loop
         with self.progress_bar(total=num_inference_steps) as progress_bar: