Skip to content

Commit ab6040a

Browse files
dg845, sayakpaul, and asomoza
authored
Add LTX2 Condition Pipeline (#13058)
* LTX2 condition pipeline initial commit
* Fix pipeline import error
* Implement LTX-2-style general image conditioning
* Blend denoising output and clean latents in sample space instead of velocity space
* make style and make quality
* make fix-copies
* Rename LTX2VideoCondition image to frames
* Update LTX2ConditionPipeline example
* Remove support for image and video in __call__
* Put latent_idx_from_index logic inline
* Improve comment on using the conditioning mask in denoising loop
* Apply suggestions from code review

Co-authored-by: Álvaro Somoza <asomoza@users.noreply.github.com>

* make fix-copies
* Migrate to Python 3.9+ style type annotations without explicit typing imports
* Forward kwargs from preprocess/postprocess_video to preprocess/postprocess resp.
* Center crop LTX-2 conditions following original code
* Duplicate video and audio position ids if using CFG
* make style and make quality
* Remove unused index_type arg to preprocess_conditions
* Add # Copied from for _normalize_latents
* Fix _normalize_latents # Copied from statement
* Add LTX-2 condition pipeline docs
* Remove TODOs
* Support only unpacked latents (5D for video, 4D for audio)
* Remove # Copied from for prepare_audio_latents

---------

Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>
Co-authored-by: Álvaro Somoza <asomoza@users.noreply.github.com>
1 parent 20364fe commit ab6040a

File tree

7 files changed

+1690
-8
lines changed

7 files changed

+1690
-8
lines changed

docs/source/en/api/pipelines/ltx2.md

Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,179 @@ encode_video(
193193
)
194194
```
195195

196+
## Condition Pipeline Generation
197+
198+
You can use `LTX2ConditionPipeline` to specify image and/or video conditions at arbitrary latent indices. For example, we can specify both a first-frame and last-frame condition to perform first-last-frame-to-video (FLF2V) generation:
199+
200+
```py
201+
import torch
202+
from diffusers import LTX2ConditionPipeline, LTX2LatentUpsamplePipeline
203+
from diffusers.pipelines.ltx2.latent_upsampler import LTX2LatentUpsamplerModel
204+
from diffusers.pipelines.ltx2.pipeline_ltx2_condition import LTX2VideoCondition
205+
from diffusers.pipelines.ltx2.utils import DISTILLED_SIGMA_VALUES, STAGE_2_DISTILLED_SIGMA_VALUES
206+
from diffusers.pipelines.ltx2.export_utils import encode_video
207+
from diffusers.utils import load_image
208+
209+
device = "cuda"
210+
width = 768
211+
height = 512
212+
random_seed = 42
213+
generator = torch.Generator(device).manual_seed(random_seed)
214+
model_path = "rootonchair/LTX-2-19b-distilled"
215+
216+
pipe = LTX2ConditionPipeline.from_pretrained(model_path, torch_dtype=torch.bfloat16)
217+
pipe.enable_sequential_cpu_offload(device=device)
218+
pipe.vae.enable_tiling()
219+
220+
prompt = (
221+
"CG animation style, a small blue bird takes off from the ground, flapping its wings. The bird's feathers are "
222+
"delicate, with a unique pattern on its chest. The background shows a blue sky with white clouds under bright "
223+
"sunshine. The camera follows the bird upward, capturing its flight and the vastness of the sky from a close-up, "
224+
"low-angle perspective."
225+
)
226+
227+
first_image = load_image(
228+
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_first_frame.png",
229+
)
230+
last_image = load_image(
231+
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_last_frame.png",
232+
)
233+
first_cond = LTX2VideoCondition(frames=first_image, index=0, strength=1.0)
234+
last_cond = LTX2VideoCondition(frames=last_image, index=-1, strength=1.0)
235+
conditions = [first_cond, last_cond]
236+
237+
frame_rate = 24.0
238+
video_latent, audio_latent = pipe(
239+
conditions=conditions,
240+
prompt=prompt,
241+
width=width,
242+
height=height,
243+
num_frames=121,
244+
frame_rate=frame_rate,
245+
num_inference_steps=8,
246+
sigmas=DISTILLED_SIGMA_VALUES,
247+
guidance_scale=1.0,
248+
generator=generator,
249+
output_type="latent",
250+
return_dict=False,
251+
)
252+
253+
latent_upsampler = LTX2LatentUpsamplerModel.from_pretrained(
254+
model_path,
255+
subfolder="latent_upsampler",
256+
torch_dtype=torch.bfloat16,
257+
)
258+
upsample_pipe = LTX2LatentUpsamplePipeline(vae=pipe.vae, latent_upsampler=latent_upsampler)
259+
upsample_pipe.enable_model_cpu_offload(device=device)
260+
upscaled_video_latent = upsample_pipe(
261+
latents=video_latent,
262+
output_type="latent",
263+
return_dict=False,
264+
)[0]
265+
266+
video, audio = pipe(
267+
latents=upscaled_video_latent,
268+
audio_latents=audio_latent,
269+
prompt=prompt,
270+
width=width * 2,
271+
height=height * 2,
272+
num_inference_steps=3,
273+
sigmas=STAGE_2_DISTILLED_SIGMA_VALUES,
274+
generator=generator,
275+
guidance_scale=1.0,
276+
output_type="np",
277+
return_dict=False,
278+
)
279+
280+
encode_video(
281+
video[0],
282+
fps=frame_rate,
283+
audio=audio[0].float().cpu(),
284+
audio_sample_rate=pipe.vocoder.config.output_sampling_rate,
285+
output_path="ltx2_distilled_flf2v.mp4",
286+
)
287+
```
288+
289+
You can use both image and video conditions:
290+
291+
```py
292+
import torch
293+
from diffusers import LTX2ConditionPipeline
294+
from diffusers.pipelines.ltx2.pipeline_ltx2_condition import LTX2VideoCondition
295+
from diffusers.pipelines.ltx2.export_utils import encode_video
296+
from diffusers.utils import load_image, load_video
297+
298+
device = "cuda"
299+
width = 768
300+
height = 512
301+
random_seed = 42
302+
generator = torch.Generator(device).manual_seed(random_seed)
303+
model_path = "rootonchair/LTX-2-19b-distilled"
304+
305+
pipe = LTX2ConditionPipeline.from_pretrained(model_path, torch_dtype=torch.bfloat16)
306+
pipe.enable_sequential_cpu_offload(device=device)
307+
pipe.vae.enable_tiling()
308+
309+
prompt = (
310+
"The video depicts a long, straight highway stretching into the distance, flanked by metal guardrails. The road is "
311+
"divided into multiple lanes, with a few vehicles visible in the far distance. The surrounding landscape features "
312+
"dry, grassy fields on one side and rolling hills on the other. The sky is mostly clear with a few scattered "
313+
"clouds, suggesting a bright, sunny day. And then the camera switch to a winding mountain road covered in snow, "
314+
"with a single vehicle traveling along it. The road is flanked by steep, rocky cliffs and sparse vegetation. The "
315+
"landscape is characterized by rugged terrain and a river visible in the distance. The scene captures the "
316+
"solitude and beauty of a winter drive through a mountainous region."
317+
)
318+
negative_prompt = (
319+
"blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, "
320+
"grainy texture, poor lighting, flickering, motion blur, distorted proportions, unnatural skin tones, "
321+
"deformed facial features, asymmetrical face, missing facial features, extra limbs, disfigured hands, "
322+
"wrong hand count, artifacts around text, inconsistent perspective, camera shake, incorrect depth of "
323+
"field, background too sharp, background clutter, distracting reflections, harsh shadows, inconsistent "
324+
"lighting direction, color banding, cartoonish rendering, 3D CGI look, unrealistic materials, uncanny "
325+
"valley effect, incorrect ethnicity, wrong gender, exaggerated expressions, wrong gaze direction, "
326+
"mismatched lip sync, silent or muted audio, distorted voice, robotic voice, echo, background noise, "
327+
"off-sync audio, incorrect dialogue, added dialogue, repetitive speech, jittery movement, awkward "
328+
"pauses, incorrect timing, unnatural transitions, inconsistent framing, tilted camera, flat lighting, "
329+
"inconsistent tone, cinematic oversaturation, stylized filters, or AI artifacts."
330+
)
331+
332+
cond_video = load_video(
333+
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cosmos/cosmos-video2world-input-vid.mp4"
334+
)
335+
cond_image = load_image(
336+
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cosmos/cosmos-video2world-input.jpg"
337+
)
338+
video_cond = LTX2VideoCondition(frames=cond_video, index=0, strength=1.0)
339+
image_cond = LTX2VideoCondition(frames=cond_image, index=8, strength=1.0)
340+
conditions = [video_cond, image_cond]
341+
342+
frame_rate = 24.0
343+
video, audio = pipe(
344+
conditions=conditions,
345+
prompt=prompt,
346+
negative_prompt=negative_prompt,
347+
width=width,
348+
height=height,
349+
num_frames=121,
350+
frame_rate=frame_rate,
351+
num_inference_steps=40,
352+
guidance_scale=4.0,
353+
generator=generator,
354+
output_type="np",
355+
return_dict=False,
356+
)
357+
358+
encode_video(
359+
video[0],
360+
fps=frame_rate,
361+
audio=audio[0].float().cpu(),
362+
audio_sample_rate=pipe.vocoder.config.output_sampling_rate,
363+
output_path="ltx2_cond_video.mp4",
364+
)
365+
```
366+
367+
Because the conditioning is done via latent frames, the 8 data space frames corresponding to the specified latent frame for an image condition will tend to be static.
368+
196369
## LTX2Pipeline
197370

198371
[[autodoc]] LTX2Pipeline
@@ -205,6 +378,12 @@ encode_video(
205378
- all
206379
- __call__
207380

381+
## LTX2ConditionPipeline
382+
383+
[[autodoc]] LTX2ConditionPipeline
384+
- all
385+
- __call__
386+
208387
## LTX2LatentUpsamplePipeline
209388

210389
[[autodoc]] LTX2LatentUpsamplePipeline

src/diffusers/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -571,6 +571,7 @@
571571
"LEditsPPPipelineStableDiffusionXL",
572572
"LongCatImageEditPipeline",
573573
"LongCatImagePipeline",
574+
"LTX2ConditionPipeline",
574575
"LTX2ImageToVideoPipeline",
575576
"LTX2LatentUpsamplePipeline",
576577
"LTX2Pipeline",
@@ -1318,6 +1319,7 @@
13181319
LEditsPPPipelineStableDiffusionXL,
13191320
LongCatImageEditPipeline,
13201321
LongCatImagePipeline,
1322+
LTX2ConditionPipeline,
13211323
LTX2ImageToVideoPipeline,
13221324
LTX2LatentUpsamplePipeline,
13231325
LTX2Pipeline,

src/diffusers/pipelines/__init__.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -292,7 +292,12 @@
292292
"LTXLatentUpsamplePipeline",
293293
"LTXI2VLongMultiPromptPipeline",
294294
]
295-
_import_structure["ltx2"] = ["LTX2Pipeline", "LTX2ImageToVideoPipeline", "LTX2LatentUpsamplePipeline"]
295+
_import_structure["ltx2"] = [
296+
"LTX2Pipeline",
297+
"LTX2ConditionPipeline",
298+
"LTX2ImageToVideoPipeline",
299+
"LTX2LatentUpsamplePipeline",
300+
]
296301
_import_structure["lumina"] = ["LuminaPipeline", "LuminaText2ImgPipeline"]
297302
_import_structure["lumina2"] = ["Lumina2Pipeline", "Lumina2Text2ImgPipeline"]
298303
_import_structure["lucy"] = ["LucyEditPipeline"]
@@ -731,7 +736,7 @@
731736
LTXLatentUpsamplePipeline,
732737
LTXPipeline,
733738
)
734-
from .ltx2 import LTX2ImageToVideoPipeline, LTX2LatentUpsamplePipeline, LTX2Pipeline
739+
from .ltx2 import LTX2ConditionPipeline, LTX2ImageToVideoPipeline, LTX2LatentUpsamplePipeline, LTX2Pipeline
735740
from .lucy import LucyEditPipeline
736741
from .lumina import LuminaPipeline, LuminaText2ImgPipeline
737742
from .lumina2 import Lumina2Pipeline, Lumina2Text2ImgPipeline

src/diffusers/pipelines/ltx2/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
_import_structure["connectors"] = ["LTX2TextConnectors"]
2626
_import_structure["latent_upsampler"] = ["LTX2LatentUpsamplerModel"]
2727
_import_structure["pipeline_ltx2"] = ["LTX2Pipeline"]
28+
_import_structure["pipeline_ltx2_condition"] = ["LTX2ConditionPipeline"]
2829
_import_structure["pipeline_ltx2_image2video"] = ["LTX2ImageToVideoPipeline"]
2930
_import_structure["pipeline_ltx2_latent_upsample"] = ["LTX2LatentUpsamplePipeline"]
3031
_import_structure["vocoder"] = ["LTX2Vocoder"]
@@ -40,6 +41,7 @@
4041
from .connectors import LTX2TextConnectors
4142
from .latent_upsampler import LTX2LatentUpsamplerModel
4243
from .pipeline_ltx2 import LTX2Pipeline
44+
from .pipeline_ltx2_condition import LTX2ConditionPipeline
4345
from .pipeline_ltx2_image2video import LTX2ImageToVideoPipeline
4446
from .pipeline_ltx2_latent_upsample import LTX2LatentUpsamplePipeline
4547
from .vocoder import LTX2Vocoder

0 commit comments

Comments
 (0)