Initial DiT and VAE changes for LTX inference #783

aartilalwani wants to merge 10 commits into hao-ai-lab:main from
Conversation
Could you run pre-commit on this PR?
```python
)

generator = VideoGenerator.from_pretrained(
    model_path="data/Lightricks/LTX-Video",
```
`VideoGenerator.from_pretrained()` should download the model for you, meaning directly passing `Lightricks/LTX-Video` as the `model_path` should work. Could you test and simplify this example accordingly? Thanks!
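For illustration, a minimal sketch of what the simplified example could look like, assuming `from_pretrained` resolves and downloads Hugging Face Hub IDs as described (the `from fastvideo import VideoGenerator` import is an assumption based on the snippet above):

```python
# Sketch only: assumes VideoGenerator.from_pretrained() downloads the
# model when given a Hugging Face Hub ID instead of a local path.
from fastvideo import VideoGenerator

generator = VideoGenerator.from_pretrained(
    model_path="Lightricks/LTX-Video",  # Hub ID, no local "data/" prefix
)
```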
```python
# TODO: fix all of the configs so it's an exact match

# # Text encoder configuration
# text_encoder_configs: tuple[EncoderConfig, ...] = field(
# # todo: set max length later
# # def ltx_t5_config():
# #     config = T5Config()
# #     config.tokenizer_kwargs["max_length"] = 128
# #     return config

# # @dataclass
# # class LTXConfig(PipelineConfig):
# #     text_encoder_configs: tuple[EncoderConfig, ...] = field(
# #         default_factory=lambda: (ltx_t5_config(), ))
#     default_factory=lambda: (T5Config(), ))
```
Are these not needed? If so, please remove them.
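If the max-length override does turn out to be needed, a cleaned-up version of the commented-out helper might look like the sketch below (reconstructed from the comments above; `T5Config`, `EncoderConfig`, and `PipelineConfig` are assumed to be the types the file already imports):

```python
from dataclasses import dataclass, field

# Sketch assembled from the commented-out code above; keep only if
# LTX actually needs the 128-token cap on the T5 text encoder.
def ltx_t5_config() -> T5Config:
    config = T5Config()
    config.tokenizer_kwargs["max_length"] = 128
    return config

@dataclass
class LTXConfig(PipelineConfig):
    text_encoder_configs: tuple[EncoderConfig, ...] = field(
        default_factory=lambda: (ltx_t5_config(), ))
```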
```python
# TODO: load differently for each config
# Text-to-Video: Only needs the decoder (to decode latents to video)
# Image-to-Video: Needs both encoder (to encode input image) and decoder
# @dataclass
# class LTXT2VConfig(LTXConfig):
#     def __post_init__(self):
#         super().__post_init__()
#         self.vae_config.load_encoder = False
#         self.vae_config.load_decoder = True

# @dataclass
# class LTXI2VConfig(LTXConfig):
#     def __post_init__(self):
#         super().__post_init__()
#         self.vae_config.load_encoder = True
#         self.vae_config.load_decoder = True
```
Also here, please clean up the comments.
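If the per-mode VAE loading is kept, the cleaned-up configs could be as compact as this sketch (derived directly from the commented code above; T2V only decodes latents, while I2V also encodes the conditioning image):

```python
from dataclasses import dataclass

@dataclass
class LTXT2VConfig(LTXConfig):
    """Text-to-Video: only the VAE decoder is needed (latents -> video)."""

    def __post_init__(self):
        super().__post_init__()
        self.vae_config.load_encoder = False
        self.vae_config.load_decoder = True

@dataclass
class LTXI2VConfig(LTXConfig):
    """Image-to-Video: encoder for the input image, decoder for latents."""

    def __post_init__(self):
        super().__post_init__()
        self.vae_config.load_encoder = True
        self.vae_config.load_decoder = True
```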
```python
class LTXSamplingParam(SamplingParam):
    # Video parameters
    height: int = 512
    width: int = 704

    # Most defaults set in pipeline config
    num_inference_steps: int = 50
```
What's the default number of frames that the official repo generates? Could you add it here as well?
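For reference, diffusers' `LTXPipeline` defaults to `num_frames=161` alongside the same 512x704 resolution, though the official Lightricks repo should be checked before adopting a value. A sketch of the addition:

```python
class LTXSamplingParam(SamplingParam):
    # Video parameters
    height: int = 512
    width: int = 704
    # Assumed default, matching diffusers' LTXPipeline; verify against
    # the official Lightricks/LTX-Video repo before merging.
    num_frames: int = 161

    # Most defaults set in pipeline config
    num_inference_steps: int = 50
```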
```python
from diffusers.utils.torch_utils import maybe_allow_in_graph
# from ..attention import FeedForward
from fastvideo.attention import DistributedAttention, LocalAttention
# from diffusers.attention_processor import Attention
```
Please clean up the comments.
```python
# Add ImageVAEEncodingStage for I2V (conditional based on input)
# Before LatentPreparation for I2V
# if fastvideo_args.pipeline_config.ltx_i2v_mode:
#     self.add_stage(
#         stage_name="image_vae_encoding_stage",
#         stage=LTXImageVAEEncodingStage(vae=self.get_module("vae")))
```
Remove if not needed.
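Conversely, if the stage is needed for I2V, the registration would just be the commented code uncommented, roughly as sketched below (`ltx_i2v_mode` and `LTXImageVAEEncodingStage` come from the comments above and are not confirmed APIs):

```python
# Sketch: conditional stage registration for I2V, reconstructed from
# the commented-out code above (names are not confirmed APIs).
if fastvideo_args.pipeline_config.ltx_i2v_mode:
    self.add_stage(
        stage_name="image_vae_encoding_stage",
        stage=LTXImageVAEEncodingStage(vae=self.get_module("vae")))
```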
|
Closing. This PR has been left unresolved for too long and is no longer relevant.
DiT and VAE changes for the LTX inference pipeline. More optimizations, including the full pipeline, will be added in upcoming PRs.