# LTX-2-19B-Distilled

Distilled LTX-2 text-to-video & image-to-video generation model (Diffusers)

This repository hosts the distilled checkpoint of the LTX-2 19B model, optimized for fast video generation with fewer diffusion steps. It supports both text-to-video and image-to-video generation through Hugging Face's 🤗 Diffusers pipelines.
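Both examples below follow the same three-stage flow: a base denoising pass at reduced resolution using the distilled sigma schedule, a latent upsampling pass that increases spatial resolution, and a short distilled refinement pass before decoding video and audio. The `diffusers.pipelines.ltx2` modules are recent additions to Diffusers; if the imports below fail, upgrading to the latest release (or installing from source with `pip install git+https://github.com/huggingface/diffusers`) should provide them.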
## Text-to-Video Usage

```python
import torch
from diffusers.pipelines.ltx2 import LTX2Pipeline, LTX2LatentUpsamplePipeline
from diffusers.pipelines.ltx2.latent_upsampler import LTX2LatentUpsamplerModel
from diffusers.pipelines.ltx2.utils import DISTILLED_SIGMA_VALUES, STAGE_2_DISTILLED_SIGMA_VALUES
from diffusers.pipelines.ltx2.export_utils import encode_video
device = "cuda"
width = 768
height = 512
random_seed = 42
generator = torch.Generator(device).manual_seed(random_seed)
model_path = "rootonchair/LTX-2-19b-distilled"
pipe = LTX2Pipeline.from_pretrained(
model_path, torch_dtype=torch.bfloat16
)
pipe.enable_sequential_cpu_offload(device=device)  # minimal VRAM usage; slow (see the note after this example)
prompt = "A beautiful sunset over the ocean"
negative_prompt = "shaky, glitchy, low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly, transition, static."
frame_rate = 24.0
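# Stage 1: generate low-resolution video and audio latents with the
# distilled 8-step sigma schedule (guidance_scale=1.0 disables CFG).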
video_latent, audio_latent = pipe(
prompt=prompt,
negative_prompt=negative_prompt,
width=width,
height=height,
num_frames=121,
frame_rate=frame_rate,
num_inference_steps=8,
sigmas=DISTILLED_SIGMA_VALUES,
guidance_scale=1.0,
generator=generator,
output_type="latent",
return_dict=False,
)
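# Load the latent upsampler that ships with this checkpoint.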
latent_upsampler = LTX2LatentUpsamplerModel.from_pretrained(
model_path,
subfolder="latent_upsampler",
torch_dtype=torch.bfloat16,
)
upsample_pipe = LTX2LatentUpsamplePipeline(vae=pipe.vae, latent_upsampler=latent_upsampler)
upsample_pipe.enable_model_cpu_offload(device=device)
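# Upscale the stage-1 video latents to a higher spatial resolution.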
upscaled_video_latent = upsample_pipe(
latents=video_latent,
output_type="latent",
return_dict=False,
)[0]
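# Stage 2: re-noise the upscaled latents and refine them in 3 distilled
# steps, then decode to pixels and audio.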
video, audio = pipe(
latents=upscaled_video_latent,
audio_latents=audio_latent,
prompt=prompt,
negative_prompt=negative_prompt,
num_inference_steps=3,
noise_scale=STAGE_2_DISTILLED_SIGMA_VALUES[0], # renoise with first sigma value https://github.com/Lightricks/LTX-2/blob/main/packages/ltx-pipelines/src/ltx_pipelines/distilled.py#L178
sigmas=STAGE_2_DISTILLED_SIGMA_VALUES,
generator=generator,
guidance_scale=1.0,
output_type="np",
return_dict=False,
)
video = (video * 255).round().astype("uint8")
video = torch.from_numpy(video)
encode_video(
video[0],
fps=frame_rate,
audio=audio[0].float().cpu(),
audio_sample_rate=pipe.vocoder.config.output_sampling_rate,
output_path="ltx2_distilled_sample.mp4",
)
```
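`enable_sequential_cpu_offload` moves weights to the GPU submodule by submodule, which keeps peak VRAM very low but is slow. If your GPU has enough memory, the standard Diffusers offload alternatives below are usually much faster; this is a sketch using generic pipeline methods, not LTX-2-specific API:

```python
# Faster alternatives when VRAM allows; call one of these instead of
# enable_sequential_cpu_offload after creating the pipeline.
pipe.enable_model_cpu_offload(device=device)  # offload whole submodels; faster
# pipe.to(device)                             # no offload; fastest, highest VRAM
```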
## Image-to-Video Usage

```python
import torch
from diffusers.pipelines.ltx2 import LTX2ImageToVideoPipeline, LTX2LatentUpsamplePipeline
from diffusers.pipelines.ltx2.latent_upsampler import LTX2LatentUpsamplerModel
from diffusers.pipelines.ltx2.utils import DISTILLED_SIGMA_VALUES, STAGE_2_DISTILLED_SIGMA_VALUES
from diffusers.pipelines.ltx2.export_utils import encode_video
from diffusers.utils import load_image
device = "cuda"
width = 768
height = 512
random_seed = 45
generator = torch.Generator(device).manual_seed(random_seed)
model_path = "rootonchair/LTX-2-19b-distilled"
pipe = LTX2ImageToVideoPipeline.from_pretrained(
model_path, torch_dtype=torch.bfloat16
)
pipe.enable_sequential_cpu_offload(device=device)  # minimal VRAM usage; slow
image = load_image(
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg"
)
prompt = "An astronaut hatches from a fragile egg on the surface of the Moon, the shell cracking and peeling apart in gentle low-gravity motion. Fine lunar dust lifts and drifts outward with each movement, floating in slow arcs before settling back onto the ground. The astronaut pushes free in a deliberate, weightless motion, small fragments of the egg tumbling and spinning through the air. In the background, the deep darkness of space subtly shifts as stars glide with the camera's movement, emphasizing vast depth and scale. The camera performs a smooth, cinematic slow push-in, with natural parallax between the foreground dust, the astronaut, and the distant starfield. Ultra-realistic detail, physically accurate low-gravity motion, cinematic lighting, and a breath-taking, movie-like shot."
negative_prompt = "shaky, glitchy, low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly, transition, static."
frame_rate = 24.0
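# Stage 1: generate low-resolution video and audio latents conditioned on
# the input image.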
video_latent, audio_latent = pipe(
image=image,
prompt=prompt,
negative_prompt=negative_prompt,
width=width,
height=height,
num_frames=121,
frame_rate=frame_rate,
num_inference_steps=8,
sigmas=DISTILLED_SIGMA_VALUES,
guidance_scale=1.0,
generator=generator,
output_type="latent",
return_dict=False,
)
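# Load the latent upsampler that ships with this checkpoint.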
latent_upsampler = LTX2LatentUpsamplerModel.from_pretrained(
model_path,
subfolder="latent_upsampler",
torch_dtype=torch.bfloat16,
)
upsample_pipe = LTX2LatentUpsamplePipeline(vae=pipe.vae, latent_upsampler=latent_upsampler)
upsample_pipe.enable_model_cpu_offload(device=device)
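# Upscale the stage-1 video latents to a higher spatial resolution.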
upscaled_video_latent = upsample_pipe(
latents=video_latent,
output_type="latent",
return_dict=False,
)[0]
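# Stage 2: refine the upscaled latents; width/height are doubled here to
# match the upsampled latent resolution.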
video, audio = pipe(
image=image,
latents=upscaled_video_latent,
audio_latents=audio_latent,
prompt=prompt,
negative_prompt=negative_prompt,
width=width * 2,
height=height * 2,
num_inference_steps=3,
noise_scale=STAGE_2_DISTILLED_SIGMA_VALUES[0],
sigmas=STAGE_2_DISTILLED_SIGMA_VALUES,
generator=generator,
guidance_scale=1.0,
output_type="np",
return_dict=False,
)
video = (video * 255).round().astype("uint8")
video = torch.from_numpy(video)
encode_video(
video[0],
fps=frame_rate,
audio=audio[0].float().cpu(),
audio_sample_rate=pipe.vocoder.config.output_sampling_rate,
output_path="image_ltx2_distilled_sample.mp4",
)
```
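Note that the stage-2 call passes `width * 2` and `height * 2`, which suggests the latent upsampler doubles spatial resolution, so this example renders a 1536×1024 video from the 768×512 base pass. To condition on a local file instead of the hosted example image, any RGB `PIL.Image` works (the filename below is a placeholder):

```python
from PIL import Image

# Hypothetical local input; load_image above returns a PIL image as well.
image = Image.open("my_photo.jpg").convert("RGB")
```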
**Base model:** [Lightricks/LTX-2](https://huggingface.co/Lightricks/LTX-2)