rahul7star commited on
Commit
8f29086
·
verified ·
1 Parent(s): cf97785

Create app_noaud.py

Browse files
Files changed (1) hide show
  1. app_noaud.py +295 -0
app_noaud.py ADDED
@@ -0,0 +1,295 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spaces
2
+ import gradio as gr
3
+ import torch
4
+ from diffusers import DiffusionPipeline
5
+ from diffusers.utils import load_image, export_to_video
6
+ import random
7
+ import numpy as np
8
+ from moviepy import ImageSequenceClip, AudioFileClip, VideoFileClip
9
+ from PIL import Image, ImageOps
10
+ import os
11
+
12
# ============================================================
# 🔥 GLOBAL PERFORMANCE SETTINGS (H200 OPTIMIZED)
# ============================================================

# Allow TF32 matmul/conv kernels — faster on Ampere+ GPUs at slightly
# reduced float32 precision (irrelevant here since we run in bfloat16).
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
# Inference-only app: disable autograd globally instead of per-call.
torch.set_grad_enabled(False)

# Prefer fused flash / memory-efficient scaled-dot-product attention kernels.
torch.backends.cuda.enable_flash_sdp(True)
torch.backends.cuda.enable_mem_efficient_sdp(True)

DEVICE = "cuda"
DTYPE = torch.bfloat16

# ============================================================
# 🎯 DISTILLED SIGMAS
# ============================================================

# Fixed 8-step noise schedule for the distilled LTX-2 checkpoint; passed as
# `sigmas=` to the pipeline so num_inference_steps=8 matches this schedule.
DISTILLED_SIGMA_VALUES = [
    1.0, 0.99375, 0.9875, 0.98125, 0.975, 0.909375, 0.725, 0.421875
]

# ============================================================
# 🚀 LOAD MODEL ON STARTUP (ONLY ONCE)
# ============================================================

print("🚀 Loading LTX-2 Distilled on H200...")

# Custom community pipeline that accepts an optional audio conditioning input.
pipe = DiffusionPipeline.from_pretrained(
    "rootonchair/LTX-2-19b-distilled",
    custom_pipeline="multimodalart/ltx2-audio-to-video",
    torch_dtype=DTYPE,
)

# NOTE(review): moves weights to CUDA at import time — assumes a GPU is
# visible when the module loads (confirm this holds under the Spaces runtime).
pipe.to(DEVICE)

# Enable memory efficient attention
try:
    pipe.enable_xformers_memory_efficient_attention()
    print("✅ xFormers enabled")
except Exception:
    # Best-effort: fall back to the default SDPA kernels enabled above.
    print("⚠️ xFormers not available")

# Load & Fuse LoRA ONCE
# Fusing bakes the LoRA into the base weights (scale 0.8) so per-call
# adapter dispatch is avoided; the adapter itself can then be unloaded.
print("📦 Loading Detailer LoRA...")
pipe.load_lora_weights(
    "Lightricks/LTX-2-19b-IC-LoRA-Detailer",
    adapter_name="detailer"
)
pipe.fuse_lora(lora_scale=0.8)
pipe.unload_lora_weights()

print("🔥 Model fully loaded on CUDA.")
+
66
+ # ============================================================
67
+ # 🎬 HELPER FUNCTIONS
68
+ # ============================================================
69
+
70
def save_video(video_frames, audio_path=None, fps=24):
    """Write `video_frames` to an H.264 MP4, muxing in optional audio.

    Args:
        video_frames: one of
            * a list of PIL images (possibly nested one level, as diffusers
              returns for a batch of one),
            * a path (str) to an existing video file, or
            * any frame sequence `export_to_video` understands.
        audio_path: optional path to an audio file; trimmed to the video
            length if longer.
        fps: output frame rate.

    Returns:
        Path of the written MP4 file.
    """
    output_filename = f"output_{random.randint(0, 100000)}.mp4"

    temp_path = None
    if isinstance(video_frames, list):
        # Diffusers pipelines return [[frames]] for a batch of one — unwrap.
        if video_frames and isinstance(video_frames[0], list):
            frames = video_frames[0]
        else:
            frames = video_frames
        np_frames = [np.array(img) for img in frames]
        clip = ImageSequenceClip(np_frames, fps=fps)
    elif isinstance(video_frames, str):
        clip = VideoFileClip(video_frames)
    else:
        # Fallback: let diffusers serialize the frames, then reload them.
        temp_path = "temp_video_no_audio.mp4"
        export_to_video(video_frames, temp_path, fps=fps)
        clip = VideoFileClip(temp_path)

    audio_clip = None
    try:
        if audio_path:
            audio_clip = AudioFileClip(audio_path)
            if audio_clip.duration > clip.duration:
                # Trim so the MP4 ends with the last video frame.
                audio_clip = audio_clip.subclipped(0, clip.duration)
            clip = clip.with_audio(audio_clip)
            audio_codec = "aac"
        else:
            audio_codec = None

        clip.write_videofile(
            output_filename,
            fps=fps,
            codec="libx264",
            audio_codec=audio_codec,
            logger=None
        )
    finally:
        # Bug fix: clips were only closed on the success path, and the
        # intermediate no-audio file was left on disk permanently.
        clip.close()
        if audio_clip is not None:
            audio_clip.close()
        if temp_path and os.path.exists(temp_path):
            os.remove(temp_path)

    return output_filename
112
+
113
+
114
def infer_aspect_ratio(image):
    """Return the supported (width, height) whose aspect ratio is closest
    to the input image's width/height ratio.

    Supported outputs: 512x512 (1:1), 768x512 (16:9), 512x768 (9:16).
    """
    # Map each candidate ratio directly onto its target resolution.
    ratio_to_resolution = {
        1.0: (512, 512),      # 1:1
        16 / 9: (768, 512),   # 16:9
        9 / 16: (512, 768),   # 9:16
    }

    w, h = image.size
    actual_ratio = w / h

    nearest = min(ratio_to_resolution, key=lambda r: abs(r - actual_ratio))
    return ratio_to_resolution[nearest]
136
+
137
+
138
def process_image_for_aspect_ratio(image):
    """Center-crop and resize `image` to the nearest supported resolution.

    Returns (processed_image, width, height).
    """
    width, height = infer_aspect_ratio(image)

    # ImageOps.fit crops to the exact target aspect ratio around the center
    # before resizing, so the subject is never distorted.
    fitted = ImageOps.fit(
        image,
        (width, height),
        method=Image.LANCZOS,
        centering=(0.5, 0.5),
    )

    return fitted, width, height
149
+
150
+
151
def get_audio_duration(audio_path):
    """Gradio change-handler: sync the duration slider to uploaded audio.

    Returns a gr.update() setting the slider to the clip's duration, capped
    at 12 s and rounded to the nearest 0.5 s. Returns a no-op update when
    no audio is provided or the file cannot be read.
    """
    if audio_path is None:
        return gr.update()

    try:
        audio_clip = AudioFileClip(audio_path)
        try:
            duration = audio_clip.duration
        finally:
            # Bug fix: the clip was previously left open if reading
            # `duration` raised.
            audio_clip.close()
    except Exception:
        # Bug fix: was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; narrowed to Exception, same no-op fallback.
        return gr.update()

    capped = min(duration, 12.0)
    rounded = round(capped * 2) / 2  # snap to the slider's 0.5 s step
    return gr.update(value=rounded)
165
+
166
+ # ============================================================
167
+ # 🎥 GENERATION FUNCTION (GPU ONLY HERE)
168
+ # ============================================================
169
+
170
@spaces.GPU(duration=85, size="xlarge")
def generate(
    image_path,
    audio_path,
    prompt,
    negative_prompt,
    video_duration,
    seed,
    progress=gr.Progress(track_tqdm=True)
):
    """Run the LTX-2 image(+audio)-to-video pipeline.

    Args:
        image_path: path to the conditioning image (required).
        audio_path: optional audio file; when present it drives both the
            generation and the final video duration.
        prompt / negative_prompt: text conditioning.
        video_duration: requested length in seconds (overridden by audio).
        seed: RNG seed; -1 picks a random one.
        progress: Gradio progress tracker (tqdm-backed).

    Returns:
        (output_video_path, seed_actually_used)

    Raises:
        gr.Error: if no image was provided.
    """
    if not image_path:
        raise gr.Error("Please provide an image.")

    # -1 is the UI sentinel for "randomize"; the real seed is returned so
    # the user can reproduce the result.
    if seed == -1:
        seed = random.randint(0, 1_000_000)

    generator = torch.Generator(device="cuda").manual_seed(seed)

    original_image = load_image(image_path)
    # Crop/resize to the nearest supported resolution (512x512, 768x512,
    # or 512x768) and record the final dimensions for the pipeline call.
    image, width, height = process_image_for_aspect_ratio(original_image)

    fps = 24.0

    # If audio exists → override duration
    # (capped at 12 s, matching the UI slider's maximum)
    if audio_path:
        audio_clip = AudioFileClip(audio_path)
        video_duration = min(audio_clip.duration, 12.0)
        audio_clip.close()

    total_frames = int(video_duration * fps)
    # Round to a multiple of 8 then add 1: frame counts of the form 8k+1,
    # with a floor of 9 frames. NOTE(review): presumably a constraint of the
    # LTX temporal compression — confirm against the pipeline docs.
    base_block = round(total_frames / 8) * 8
    num_frames = max(base_block + 1, 9)

    print(f"Seed: {seed} | {width}x{height} | Frames: {num_frames}")

    with torch.inference_mode():
        if audio_path:
            # Audio-conditioned path: with return_dict=False the custom
            # pipeline returns (video, audio); the audio half is discarded
            # because save_video muxes the original file instead.
            video_output, _ = pipe(
                image=image,
                audio=audio_path,
                prompt=prompt,
                negative_prompt=negative_prompt,
                width=width,
                height=height,
                num_frames=num_frames,
                frame_rate=fps,
                num_inference_steps=8,
                sigmas=DISTILLED_SIGMA_VALUES,
                guidance_scale=1.0,
                generator=generator,
                return_dict=False,
            )
        else:
            # Image-only path: same call minus `audio`; return_dict=False
            # yields a tuple whose first element is the frames.
            video_output = pipe(
                image=image,
                prompt=prompt,
                negative_prompt=negative_prompt,
                width=width,
                height=height,
                num_frames=num_frames,
                frame_rate=fps,
                num_inference_steps=8,
                sigmas=DISTILLED_SIGMA_VALUES,
                guidance_scale=1.0,
                generator=generator,
                return_dict=False,
            )[0]

    output_path = save_video(video_output, audio_path, fps=fps)

    return output_path, seed
241
+
242
+ # ============================================================
243
+ # 🖥️ GRADIO UI
244
+ # ============================================================
245
+
246
# Constrain the layout to a centered 800px column.
css = "#col-container { max-width: 800px; margin: 0 auto; }"

with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
    with gr.Column(elem_id="col-container"):
        gr.Markdown("# ⚡ LTX-2 Distilled Image-to-Video (Audio Optional)")

        with gr.Row():
            with gr.Column():
                # filepath mode: handlers receive paths, not arrays.
                input_image = gr.Image(type="filepath", height=300)
                input_audio = gr.Audio(type="filepath", label="Optional Audio")
            with gr.Column():
                result_video = gr.Video()

        prompt = gr.Textbox(
            value="A person speaking naturally",
            lines=2
        )

        # 0.5 s step matches the rounding done in get_audio_duration.
        video_duration = gr.Slider(1.0, 12.0, step=0.5, value=4.0)

        with gr.Accordion("Advanced", open=False):
            negative_prompt = gr.Textbox(
                value="low quality, worst quality"
            )
            # -1 means "pick a random seed" (see generate()).
            seed = gr.Number(value=-1, precision=0)

        run_btn = gr.Button("Generate", variant="primary")
        # Hidden sink for the seed generate() actually used.
        used_seed = gr.Number(visible=False)

        # Auto-fill the duration slider whenever an audio file is uploaded.
        input_audio.change(
            fn=get_audio_duration,
            inputs=[input_audio],
            outputs=[video_duration]
        )

        run_btn.click(
            fn=generate,
            inputs=[
                input_image,
                input_audio,
                prompt,
                negative_prompt,
                video_duration,
                seed
            ],
            outputs=[result_video, used_seed]
        )

if __name__ == "__main__":
    # queue() serializes GPU jobs so concurrent users wait their turn.
    demo.queue().launch()