Spaces:

rahul7star
/

LTX2_distill

Paused

App Files Files Community

rahul7star committed 29 days ago

Commit

8f29086

verified ·

1 Parent(s): cf97785

Create app_noaud.py

Browse files

Files changed (1) hide show

app_noaud.py +295 -0

app_noaud.py ADDED Viewed

	@@ -0,0 +1,295 @@

+import spaces
+import gradio as gr
+import torch
+from diffusers import DiffusionPipeline
+from diffusers.utils import load_image, export_to_video
+import random
+import numpy as np
+from moviepy import ImageSequenceClip, AudioFileClip, VideoFileClip
+from PIL import Image, ImageOps
+import os
# ============================================================
# 🔥 GLOBAL PERFORMANCE SETTINGS (H200 OPTIMIZED)
# ============================================================
# Permit TF32 kernels for CUDA matmuls and cuDNN ops.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
# This process only runs inference, so autograd is disabled globally.
torch.set_grad_enabled(False)
# Enable the flash and memory-efficient scaled-dot-product attention backends.
torch.backends.cuda.enable_flash_sdp(True)
torch.backends.cuda.enable_mem_efficient_sdp(True)
DEVICE = "cuda"
DTYPE = torch.bfloat16
# ============================================================
# 🎯 DISTILLED SIGMAS
# ============================================================
# Sigma schedule handed to the pipeline through its `sigmas=` argument;
# it has eight entries, matching the eight inference steps used by generate().
DISTILLED_SIGMA_VALUES = [
    1.0, 0.99375, 0.9875, 0.98125, 0.975, 0.909375, 0.725, 0.421875
]
# ============================================================
# 🚀 LOAD MODEL ON STARTUP (ONLY ONCE)
# ============================================================
print("🚀 Loading LTX-2 Distilled on H200...")
pipe = DiffusionPipeline.from_pretrained(
    "rootonchair/LTX-2-19b-distilled",
    custom_pipeline="multimodalart/ltx2-audio-to-video",
    torch_dtype=DTYPE,
)
pipe.to(DEVICE)
# Enable memory efficient attention.
# This is best-effort: the app keeps running without xFormers, but the reason
# is reported so a real failure is distinguishable from "not installed".
try:
    pipe.enable_xformers_memory_efficient_attention()
    print("✅ xFormers enabled")
except Exception as exc:
    print(f"⚠️ xFormers not available: {exc}")
# Load & Fuse LoRA ONCE
print("📦 Loading Detailer LoRA...")
pipe.load_lora_weights(
    "Lightricks/LTX-2-19b-IC-LoRA-Detailer",
    adapter_name="detailer"
)
# Fuse the adapter into the base weights at scale 0.8, then unload the
# separate LoRA weights now that they have been fused.
pipe.fuse_lora(lora_scale=0.8)
pipe.unload_lora_weights()
print("🔥 Model fully loaded on CUDA.")
+# ============================================================
+# 🎬 HELPER FUNCTIONS
+# ============================================================
def save_video(video_frames, audio_path=None, fps=24):
    """Encode generated frames (optionally muxed with audio) to an MP4 file.

    Args:
        video_frames: One of
            * a list of images, or a list whose first element is such a list
              (only that first element is encoded);
            * a string path to an existing video file;
            * any other object accepted by ``export_to_video``.
        audio_path: Optional path to an audio file. When given, the audio is
            trimmed to the video length if it is longer and attached to the
            output using the AAC codec.
        fps: Frame rate used for both the clip and the written file.

    Returns:
        The filename of the written ``output_<id>.mp4`` file, relative to the
        current working directory.

    Raises:
        ValueError: If ``video_frames`` is a list containing no frames.
    """
    # Imported locally so the block is self-contained.
    import tempfile
    import uuid

    # A UUID avoids the collisions a small random integer can produce when
    # several requests are encoded at the same time.
    output_filename = f"output_{uuid.uuid4().hex}.mp4"

    clip = None
    audio_clip = None
    temp_path = None
    try:
        # Convert frames
        if isinstance(video_frames, list):
            frames = video_frames
            if frames and isinstance(frames[0], list):
                frames = frames[0]
            if not frames:
                raise ValueError("save_video received no frames to encode.")
            np_frames = [np.array(img) for img in frames]
            clip = ImageSequenceClip(np_frames, fps=fps)
        elif isinstance(video_frames, str):
            clip = VideoFileClip(video_frames)
        else:
            # Use a unique intermediate file instead of a fixed shared name so
            # concurrent calls cannot overwrite each other's data.
            fd, temp_path = tempfile.mkstemp(suffix=".mp4")
            os.close(fd)
            export_to_video(video_frames, temp_path, fps=fps)
            clip = VideoFileClip(temp_path)

        audio_codec = None
        if audio_path:
            audio_clip = AudioFileClip(audio_path)
            if audio_clip.duration > clip.duration:
                audio_clip = audio_clip.subclipped(0, clip.duration)
            clip = clip.with_audio(audio_clip)
            audio_codec = "aac"

        clip.write_videofile(
            output_filename,
            fps=fps,
            codec="libx264",
            audio_codec=audio_codec,
            logger=None
        )
    finally:
        # Release readers even when encoding fails, then drop the
        # intermediate file if one was created.
        if clip is not None:
            clip.close()
        if audio_clip is not None:
            audio_clip.close()
        if temp_path is not None and os.path.exists(temp_path):
            os.remove(temp_path)
    return output_filename
def infer_aspect_ratio(image):
    """Pick the output resolution whose nominal ratio is closest to the image's.

    Args:
        image: Object exposing ``size`` as a ``(width, height)`` pair.

    Returns:
        A ``(width, height)`` tuple: ``(512, 512)``, ``(768, 512)`` or
        ``(512, 768)``.
    """
    # (nominal width/height ratio, output resolution), listed in the order the
    # candidates are compared; min() keeps the earliest entry on a tie.
    candidates = (
        (1.0, (512, 512)),
        (16 / 9, (768, 512)),
        (9 / 16, (512, 768)),
    )
    img_w, img_h = image.size
    actual_ratio = img_w / img_h
    _, best_resolution = min(
        candidates,
        key=lambda candidate: abs(candidate[0] - actual_ratio),
    )
    return best_resolution
def process_image_for_aspect_ratio(image):
    """Fit ``image`` to the resolution chosen by ``infer_aspect_ratio``.

    The image is resized and cropped around its center with ``ImageOps.fit``
    using LANCZOS resampling.

    Returns:
        ``(fitted_image, width, height)`` where width/height are the chosen
        target resolution.
    """
    out_w, out_h = infer_aspect_ratio(image)
    target_size = (out_w, out_h)
    fitted = ImageOps.fit(
        image, target_size, method=Image.LANCZOS, centering=(0.5, 0.5)
    )
    return fitted, out_w, out_h
def get_audio_duration(audio_path):
    """Return a Gradio update that sets the duration control to the audio length.

    Args:
        audio_path: Path of the audio file, or ``None`` when no audio is set.

    Returns:
        ``gr.update(value=...)`` with the audio duration capped at 12.0 seconds
        and rounded to the nearest 0.5, or a no-op ``gr.update()`` when there
        is no audio or its duration cannot be determined.
    """
    if audio_path is None:
        return gr.update()
    try:
        audio_clip = AudioFileClip(audio_path)
        try:
            duration = audio_clip.duration
        finally:
            # Always release the reader, even if reading the duration fails.
            audio_clip.close()
        capped = min(duration, 12.0)
        rounded = round(capped * 2) / 2
        return gr.update(value=rounded)
    except Exception:
        # Best-effort: an unreadable file leaves the control unchanged.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate.
        return gr.update()
+# ============================================================
+# 🎥 GENERATION FUNCTION (GPU ONLY HERE)
+# ============================================================
@spaces.GPU(duration=85, size="xlarge")
def generate(
    image_path,
    audio_path,
    prompt,
    negative_prompt,
    video_duration,
    seed,
    progress=gr.Progress(track_tqdm=True)
):
    """Generate a video from an image (and optional audio) and save it as MP4.

    Args:
        image_path: Path of the conditioning image; required.
        audio_path: Optional audio file path. When given, it is passed to the
            pipeline, it overrides ``video_duration`` (capped at 12 s) and it
            is muxed into the output file.
        prompt: Text prompt forwarded to the pipeline.
        negative_prompt: Negative prompt forwarded to the pipeline.
        video_duration: Requested length in seconds (ignored when audio is set).
        seed: Integer seed; ``-1`` or ``None`` selects a random seed.
        progress: Gradio progress tracker (tracks tqdm bars).

    Returns:
        ``(output_path, seed)`` — the written video file and the seed used.

    Raises:
        gr.Error: If no image was provided.
    """
    if not image_path:
        raise gr.Error("Please provide an image.")

    # A cleared number field arrives as None; treat it like -1 instead of
    # letting manual_seed() fail on it.
    if seed is None or seed == -1:
        seed = random.randint(0, 1_000_000)
    seed = int(seed)
    generator = torch.Generator(device=DEVICE).manual_seed(seed)

    original_image = load_image(image_path)
    image, width, height = process_image_for_aspect_ratio(original_image)
    fps = 24.0

    # If audio exists → override duration
    if audio_path:
        audio_clip = AudioFileClip(audio_path)
        try:
            video_duration = min(audio_clip.duration, 12.0)
        finally:
            audio_clip.close()

    # Frame count is the nearest multiple of 8 plus one, and never below 9.
    # NOTE(review): presumably the pipeline expects 8k+1 frames — confirm.
    total_frames = int(video_duration * fps)
    base_block = round(total_frames / 8) * 8
    num_frames = max(base_block + 1, 9)
    print(f"Seed: {seed} | {width}x{height} | Frames: {num_frames}")

    # Arguments shared by the audio and no-audio calls.
    pipe_kwargs = dict(
        image=image,
        prompt=prompt,
        negative_prompt=negative_prompt,
        width=width,
        height=height,
        num_frames=num_frames,
        frame_rate=fps,
        num_inference_steps=8,
        sigmas=DISTILLED_SIGMA_VALUES,
        guidance_scale=1.0,
        generator=generator,
        return_dict=False,
    )
    if audio_path:
        pipe_kwargs["audio"] = audio_path

    with torch.inference_mode():
        # With return_dict=False the video is the first element of the result.
        video_output = pipe(**pipe_kwargs)[0]

    output_path = save_video(video_output, audio_path, fps=fps)
    return output_path, seed
# ============================================================
# 🖥️ GRADIO UI
# ============================================================
css = "#col-container { max-width: 800px; margin: 0 auto; }"
with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
    with gr.Column(elem_id="col-container"):
        gr.Markdown("# ⚡ LTX-2 Distilled Image-to-Video (Audio Optional)")
        with gr.Row():
            with gr.Column():
                input_image = gr.Image(
                    type="filepath", height=300, label="Input Image"
                )
                input_audio = gr.Audio(type="filepath", label="Optional Audio")
            with gr.Column():
                result_video = gr.Video(label="Result")
        # Explicit labels replace Gradio's generic "Textbox"/"Slider"/"Number"
        # captions so each control's purpose is visible.
        prompt = gr.Textbox(
            value="A person speaking naturally",
            lines=2,
            label="Prompt"
        )
        video_duration = gr.Slider(
            1.0, 12.0, step=0.5, value=4.0, label="Video Duration (seconds)"
        )
        with gr.Accordion("Advanced", open=False):
            negative_prompt = gr.Textbox(
                value="low quality, worst quality",
                label="Negative Prompt"
            )
            # generate() replaces -1 with a randomly drawn seed.
            seed = gr.Number(value=-1, precision=0, label="Seed (-1 = random)")
        run_btn = gr.Button("Generate", variant="primary")
        # Hidden sink for the seed value returned by generate().
        used_seed = gr.Number(visible=False)
    # Changing the audio input updates the duration slider with the value
    # returned by get_audio_duration.
    input_audio.change(
        fn=get_audio_duration,
        inputs=[input_audio],
        outputs=[video_duration]
    )
    run_btn.click(
        fn=generate,
        inputs=[
            input_image,
            input_audio,
            prompt,
            negative_prompt,
            video_duration,
            seed
        ],
        outputs=[result_video, used_seed]
    )
if __name__ == "__main__":
    demo.queue().launch()