Skip to content

Commit ab6040a

Browse files
dg845, sayakpaul, and asomoza
authored
Add LTX2 Condition Pipeline (#13058)
* LTX2 condition pipeline initial commit
* Fix pipeline import error
* Implement LTX-2-style general image conditioning
* Blend denoising output and clean latents in sample space instead of velocity space
* make style and make quality
* make fix-copies
* Rename LTX2VideoCondition image to frames
* Update LTX2ConditionPipeline example
* Remove support for image and video in __call__
* Put latent_idx_from_index logic inline
* Improve comment on using the conditioning mask in denoising loop
* Apply suggestions from code review

Co-authored-by: Álvaro Somoza <asomoza@users.noreply.github.com>

* make fix-copies
* Migrate to Python 3.9+ style type annotations without explicit typing imports
* Forward kwargs from preprocess/postprocess_video to preprocess/postprocess resp.
* Center crop LTX-2 conditions following original code
* Duplicate video and audio position ids if using CFG
* make style and make quality
* Remove unused index_type arg to preprocess_conditions
* Add # Copied from for _normalize_latents
* Fix _normalize_latents # Copied from statement
* Add LTX-2 condition pipeline docs
* Remove TODOs
* Support only unpacked latents (5D for video, 4D for audio)
* Remove # Copied from for prepare_audio_latents

---------

Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>
Co-authored-by: Álvaro Somoza <asomoza@users.noreply.github.com>
1 parent 20364fe commit ab6040a

File tree

7 files changed

+1690
-8
lines changed

7 files changed

+1690
-8
lines changed

docs/source/en/api/pipelines/ltx2.md

Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,179 @@ encode_video(
193193
)
194194
```
195195

196+
## Condition Pipeline Generation
197+
198+
You can use `LTX2ConditionPipeline` to specify image and/or video conditions at arbitrary latent indices. For example, we can specify both a first-frame and last-frame condition to perform first-last-frame-to-video (FLF2V) generation:
199+
200+
```py
201+
import torch
202+
from diffusers import LTX2ConditionPipeline, LTX2LatentUpsamplePipeline
203+
from diffusers.pipelines.ltx2.latent_upsampler import LTX2LatentUpsamplerModel
204+
from diffusers.pipelines.ltx2.pipeline_ltx2_condition import LTX2VideoCondition
205+
from diffusers.pipelines.ltx2.utils import DISTILLED_SIGMA_VALUES, STAGE_2_DISTILLED_SIGMA_VALUES
206+
from diffusers.pipelines.ltx2.export_utils import encode_video
207+
from diffusers.utils import load_image
208+
209+
device = "cuda"
210+
width = 768
211+
height = 512
212+
random_seed = 42
213+
generator = torch.Generator(device).manual_seed(random_seed)
214+
model_path = "rootonchair/LTX-2-19b-distilled"
215+
216+
pipe = LTX2ConditionPipeline.from_pretrained(model_path, torch_dtype=torch.bfloat16)
217+
pipe.enable_sequential_cpu_offload(device=device)
218+
pipe.vae.enable_tiling()
219+
220+
prompt = (
221+
"CG animation style, a small blue bird takes off from the ground, flapping its wings. The bird's feathers are "
222+
"delicate, with a unique pattern on its chest. The background shows a blue sky with white clouds under bright "
223+
"sunshine. The camera follows the bird upward, capturing its flight and the vastness of the sky from a close-up, "
224+
"low-angle perspective."
225+
)
226+
227+
first_image = load_image(
228+
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_first_frame.png",
229+
)
230+
last_image = load_image(
231+
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_last_frame.png",
232+
)
233+
first_cond = LTX2VideoCondition(frames=first_image, index=0, strength=1.0)
234+
last_cond = LTX2VideoCondition(frames=last_image, index=-1, strength=1.0)
235+
conditions = [first_cond, last_cond]
236+
237+
frame_rate = 24.0
238+
video_latent, audio_latent = pipe(
239+
conditions=conditions,
240+
prompt=prompt,
241+
width=width,
242+
height=height,
243+
num_frames=121,
244+
frame_rate=frame_rate,
245+
num_inference_steps=8,
246+
sigmas=DISTILLED_SIGMA_VALUES,
247+
guidance_scale=1.0,
248+
generator=generator,
249+
output_type="latent",
250+
return_dict=False,
251+
)
252+
253+
latent_upsampler = LTX2LatentUpsamplerModel.from_pretrained(
254+
model_path,
255+
subfolder="latent_upsampler",
256+
torch_dtype=torch.bfloat16,
257+
)
258+
upsample_pipe = LTX2LatentUpsamplePipeline(vae=pipe.vae, latent_upsampler=latent_upsampler)
259+
upsample_pipe.enable_model_cpu_offload(device=device)
260+
upscaled_video_latent = upsample_pipe(
261+
latents=video_latent,
262+
output_type="latent",
263+
return_dict=False,
264+
)[0]
265+
266+
video, audio = pipe(
267+
latents=upscaled_video_latent,
268+
audio_latents=audio_latent,
269+
prompt=prompt,
270+
width=width * 2,
271+
height=height * 2,
272+
num_inference_steps=3,
273+
sigmas=STAGE_2_DISTILLED_SIGMA_VALUES,
274+
generator=generator,
275+
guidance_scale=1.0,
276+
output_type="np",
277+
return_dict=False,
278+
)
279+
280+
encode_video(
281+
video[0],
282+
fps=frame_rate,
283+
audio=audio[0].float().cpu(),
284+
audio_sample_rate=pipe.vocoder.config.output_sampling_rate,
285+
output_path="ltx2_distilled_flf2v.mp4",
286+
)
287+
```
288+
289+
You can use both image and video conditions:
290+
291+
```py
292+
import torch
293+
from diffusers import LTX2ConditionPipeline
294+
from diffusers.pipelines.ltx2.pipeline_ltx2_condition import LTX2VideoCondition
295+
from diffusers.pipelines.ltx2.export_utils import encode_video
296+
from diffusers.utils import load_image, load_video
297+
298+
device = "cuda"
299+
width = 768
300+
height = 512
301+
random_seed = 42
302+
generator = torch.Generator(device).manual_seed(random_seed)
303+
model_path = "rootonchair/LTX-2-19b-distilled"
304+
305+
pipe = LTX2ConditionPipeline.from_pretrained(model_path, torch_dtype=torch.bfloat16)
306+
pipe.enable_sequential_cpu_offload(device=device)
307+
pipe.vae.enable_tiling()
308+
309+
prompt = (
310+
"The video depicts a long, straight highway stretching into the distance, flanked by metal guardrails. The road is "
311+
"divided into multiple lanes, with a few vehicles visible in the far distance. The surrounding landscape features "
312+
"dry, grassy fields on one side and rolling hills on the other. The sky is mostly clear with a few scattered "
313+
"clouds, suggesting a bright, sunny day. And then the camera switch to a winding mountain road covered in snow, "
314+
"with a single vehicle traveling along it. The road is flanked by steep, rocky cliffs and sparse vegetation. The "
315+
"landscape is characterized by rugged terrain and a river visible in the distance. The scene captures the "
316+
"solitude and beauty of a winter drive through a mountainous region."
317+
)
318+
negative_prompt = (
319+
"blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, "
320+
"grainy texture, poor lighting, flickering, motion blur, distorted proportions, unnatural skin tones, "
321+
"deformed facial features, asymmetrical face, missing facial features, extra limbs, disfigured hands, "
322+
"wrong hand count, artifacts around text, inconsistent perspective, camera shake, incorrect depth of "
323+
"field, background too sharp, background clutter, distracting reflections, harsh shadows, inconsistent "
324+
"lighting direction, color banding, cartoonish rendering, 3D CGI look, unrealistic materials, uncanny "
325+
"valley effect, incorrect ethnicity, wrong gender, exaggerated expressions, wrong gaze direction, "
326+
"mismatched lip sync, silent or muted audio, distorted voice, robotic voice, echo, background noise, "
327+
"off-sync audio, incorrect dialogue, added dialogue, repetitive speech, jittery movement, awkward "
328+
"pauses, incorrect timing, unnatural transitions, inconsistent framing, tilted camera, flat lighting, "
329+
"inconsistent tone, cinematic oversaturation, stylized filters, or AI artifacts."
330+
)
331+
332+
cond_video = load_video(
333+
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cosmos/cosmos-video2world-input-vid.mp4"
334+
)
335+
cond_image = load_image(
336+
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cosmos/cosmos-video2world-input.jpg"
337+
)
338+
video_cond = LTX2VideoCondition(frames=cond_video, index=0, strength=1.0)
339+
image_cond = LTX2VideoCondition(frames=cond_image, index=8, strength=1.0)
340+
conditions = [video_cond, image_cond]
341+
342+
frame_rate = 24.0
343+
video, audio = pipe(
344+
conditions=conditions,
345+
prompt=prompt,
346+
negative_prompt=negative_prompt,
347+
width=width,
348+
height=height,
349+
num_frames=121,
350+
frame_rate=frame_rate,
351+
num_inference_steps=40,
352+
guidance_scale=4.0,
353+
generator=generator,
354+
output_type="np",
355+
return_dict=False,
356+
)
357+
358+
encode_video(
359+
video[0],
360+
fps=frame_rate,
361+
audio=audio[0].float().cpu(),
362+
audio_sample_rate=pipe.vocoder.config.output_sampling_rate,
363+
output_path="ltx2_cond_video.mp4",
364+
)
365+
```
366+
367+
Because the conditioning is done via latent frames, the 8 data space frames corresponding to the specified latent frame for an image condition will tend to be static.
368+
196369
## LTX2Pipeline
197370

198371
[[autodoc]] LTX2Pipeline
@@ -205,6 +378,12 @@ encode_video(
205378
- all
206379
- __call__
207380

381+
## LTX2ConditionPipeline
382+
383+
[[autodoc]] LTX2ConditionPipeline
384+
- all
385+
- __call__
386+
208387
## LTX2LatentUpsamplePipeline
209388

210389
[[autodoc]] LTX2LatentUpsamplePipeline

src/diffusers/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -571,6 +571,7 @@
571571
"LEditsPPPipelineStableDiffusionXL",
572572
"LongCatImageEditPipeline",
573573
"LongCatImagePipeline",
574+
"LTX2ConditionPipeline",
574575
"LTX2ImageToVideoPipeline",
575576
"LTX2LatentUpsamplePipeline",
576577
"LTX2Pipeline",
@@ -1318,6 +1319,7 @@
13181319
LEditsPPPipelineStableDiffusionXL,
13191320
LongCatImageEditPipeline,
13201321
LongCatImagePipeline,
1322+
LTX2ConditionPipeline,
13211323
LTX2ImageToVideoPipeline,
13221324
LTX2LatentUpsamplePipeline,
13231325
LTX2Pipeline,

src/diffusers/pipelines/__init__.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -292,7 +292,12 @@
292292
"LTXLatentUpsamplePipeline",
293293
"LTXI2VLongMultiPromptPipeline",
294294
]
295-
_import_structure["ltx2"] = ["LTX2Pipeline", "LTX2ImageToVideoPipeline", "LTX2LatentUpsamplePipeline"]
295+
_import_structure["ltx2"] = [
296+
"LTX2Pipeline",
297+
"LTX2ConditionPipeline",
298+
"LTX2ImageToVideoPipeline",
299+
"LTX2LatentUpsamplePipeline",
300+
]
296301
_import_structure["lumina"] = ["LuminaPipeline", "LuminaText2ImgPipeline"]
297302
_import_structure["lumina2"] = ["Lumina2Pipeline", "Lumina2Text2ImgPipeline"]
298303
_import_structure["lucy"] = ["LucyEditPipeline"]
@@ -731,7 +736,7 @@
731736
LTXLatentUpsamplePipeline,
732737
LTXPipeline,
733738
)
734-
from .ltx2 import LTX2ImageToVideoPipeline, LTX2LatentUpsamplePipeline, LTX2Pipeline
739+
from .ltx2 import LTX2ConditionPipeline, LTX2ImageToVideoPipeline, LTX2LatentUpsamplePipeline, LTX2Pipeline
735740
from .lucy import LucyEditPipeline
736741
from .lumina import LuminaPipeline, LuminaText2ImgPipeline
737742
from .lumina2 import Lumina2Pipeline, Lumina2Text2ImgPipeline

src/diffusers/pipelines/ltx2/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
_import_structure["connectors"] = ["LTX2TextConnectors"]
2626
_import_structure["latent_upsampler"] = ["LTX2LatentUpsamplerModel"]
2727
_import_structure["pipeline_ltx2"] = ["LTX2Pipeline"]
28+
_import_structure["pipeline_ltx2_condition"] = ["LTX2ConditionPipeline"]
2829
_import_structure["pipeline_ltx2_image2video"] = ["LTX2ImageToVideoPipeline"]
2930
_import_structure["pipeline_ltx2_latent_upsample"] = ["LTX2LatentUpsamplePipeline"]
3031
_import_structure["vocoder"] = ["LTX2Vocoder"]
@@ -40,6 +41,7 @@
4041
from .connectors import LTX2TextConnectors
4142
from .latent_upsampler import LTX2LatentUpsamplerModel
4243
from .pipeline_ltx2 import LTX2Pipeline
44+
from .pipeline_ltx2_condition import LTX2ConditionPipeline
4345
from .pipeline_ltx2_image2video import LTX2ImageToVideoPipeline
4446
from .pipeline_ltx2_latent_upsample import LTX2LatentUpsamplePipeline
4547
from .vocoder import LTX2Vocoder

0 commit comments

Comments
 (0)