sam-audio-webui

Runtime error

App Files Community

Peter Shi committed 8 days ago

Commit

c12dff8

1 Parent(s): 42e2df6

Add example with Git LFS for MP4

Browse files

Files changed (3) hide show

.gitattributes +1 -0
app.py +114 -20
examples/office.mp4 +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.mp4 filter=lfs diff=lfs merge=lfs -text

app.py CHANGED Viewed

@@ -4,19 +4,51 @@ import torch
 import torchaudio
 import tempfile
 import warnings
 warnings.filterwarnings("ignore")
 from sam_audio import SAMAudio, SAMAudioProcessor
-# Configuration
-MODEL_NAME = "facebook/sam-audio-small"
-# Load model and processor
-print(f"Loading {MODEL_NAME}...")
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-model = SAMAudio.from_pretrained(MODEL_NAME).to(device).eval()
-processor = SAMAudioProcessor.from_pretrained(MODEL_NAME)
-print(f"Model loaded on {device}.")
 def save_audio(tensor, sample_rate):
     """Helper to save torch tensor to a temp file for Gradio output."""
@@ -25,9 +57,11 @@ def save_audio(tensor, sample_rate):
         return tmp.name
 @spaces.GPU(duration=300)
-def separate_audio(audio_path, video_path, text_prompt):
-    # Determine which input to use
-    file_path = video_path if video_path else audio_path
     if not file_path:
         return None, None, "❌ Please upload an audio or video file."
@@ -48,14 +82,27 @@ def separate_audio(audio_path, video_path, text_prompt):
         target_path = save_audio(result.target[0].unsqueeze(0).cpu(), sample_rate)
         residual_path = save_audio(result.residual[0].unsqueeze(0).cpu(), sample_rate)
-        return target_path, residual_path, f"✅ Successfully isolated '{text_prompt}'"
     except Exception as e:
         import traceback
         traceback.print_exc()
         return None, None, f"❌ Error: {str(e)}"
-# Build Gradio Interface - Simple and clean
 with gr.Blocks(title="SAM-Audio Demo") as demo:
     gr.Markdown(
         """
@@ -63,15 +110,23 @@ with gr.Blocks(title="SAM-Audio Demo") as demo:
         Isolate specific sounds from an audio or video file using natural language prompts.
-        **Model:** [facebook/sam-audio-small](https://huggingface.co/facebook/sam-audio-small)
         """
     )
     with gr.Row():
         with gr.Column():
-            gr.Markdown("### Upload Audio or Video")
-            input_audio = gr.Audio(label="Audio File", type="filepath")
-            input_video = gr.Video(label="Video File (MP4)")
             text_prompt = gr.Textbox(
                 label="Text Prompt",
@@ -88,14 +143,53 @@ with gr.Blocks(title="SAM-Audio Demo") as demo:
             output_residual = gr.Audio(label="Background (Residual)")
     gr.Markdown("---")
-    gr.Markdown("### Example Prompts")
-    gr.Markdown("- A man speaking\n- A woman singing\n- Piano\n- Drums\n- Guitar\n- Dog barking\n- Car engine")
     run_btn.click(
-        fn=separate_audio,
-        inputs=[input_audio, input_video, text_prompt],
         outputs=[output_target, output_residual, status_output]
     )
 if __name__ == "__main__":
     demo.launch()

 import torchaudio
 import tempfile
 import warnings
+import os
 warnings.filterwarnings("ignore")
 from sam_audio import SAMAudio, SAMAudioProcessor
+# Available models
+MODELS = {
+    "sam-audio-small": "facebook/sam-audio-small",
+    "sam-audio-base": "facebook/sam-audio-base",
+    "sam-audio-large": "facebook/sam-audio-large",
+    "sam-audio-small-tv (Visual)": "facebook/sam-audio-small-tv",
+    "sam-audio-base-tv (Visual)": "facebook/sam-audio-base-tv",
+    "sam-audio-large-tv (Visual)": "facebook/sam-audio-large-tv",
+}
+# Default model
+DEFAULT_MODEL = "sam-audio-small"
+# Example files
+EXAMPLES_DIR = "examples"
+EXAMPLE_FILE = os.path.join(EXAMPLES_DIR, "office.mp4")
+# Global model cache
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+current_model_name = None
+model = None
+processor = None
+def load_model(model_name):
+    """Load or switch model.
+
+    Looks ``model_name`` up in ``MODELS`` (a name that is not a key falls back
+    to the ``DEFAULT_MODEL`` entry) and rebinds the module-level ``model``,
+    ``processor`` and ``current_model_name`` globals. Returns ``None`` in all
+    cases; it returns early without loading anything when ``model_name`` is
+    the name already recorded and a model object is present.
+    """
+    global current_model_name, model, processor
+    model_id = MODELS.get(model_name, MODELS[DEFAULT_MODEL])
+    # Cache hit: the requested display name is the one already loaded.
+    if current_model_name == model_name and model is not None:
+        return
+    print(f"Loading {model_id}...")
+    model = SAMAudio.from_pretrained(model_id).to(device).eval()
+    processor = SAMAudioProcessor.from_pretrained(model_id)
+    # NOTE(review): the cache key is the requested name, not the resolved
+    # model_id, so an unknown name is remembered as-is even though the default
+    # checkpoint was the one loaded - confirm this is intended.
+    current_model_name = model_name
+    print(f"Model {model_id} loaded on {device}.")
+# Load default model at startup
+load_model(DEFAULT_MODEL)
 def save_audio(tensor, sample_rate):
     """Helper to save torch tensor to a temp file for Gradio output."""
         return tmp.name
 @spaces.GPU(duration=300)
+def separate_audio(model_name, file_path, text_prompt):
+    global model, processor
+    # Load selected model if different
+    load_model(model_name)
     if not file_path:
         return None, None, "❌ Please upload an audio or video file."
         target_path = save_audio(result.target[0].unsqueeze(0).cpu(), sample_rate)
         residual_path = save_audio(result.residual[0].unsqueeze(0).cpu(), sample_rate)
+        return target_path, residual_path, f"✅ Successfully isolated '{text_prompt}' using {model_name}"
     except Exception as e:
         import traceback
         traceback.print_exc()
         return None, None, f"❌ Error: {str(e)}"
+def process_file(model_name, file, prompt):
+    """Callback wired to ``run_btn.click``: validate the upload and separate it.
+
+    Args:
+        model_name: Value of the model dropdown, forwarded to ``separate_audio``.
+        file: Value of the ``gr.File`` input; either an object exposing a
+            ``name`` attribute or a plain value used directly as the path.
+        prompt: Text prompt, forwarded to ``separate_audio``.
+
+    Returns:
+        ``(None, None, message)`` when ``file`` is ``None``; otherwise whatever
+        ``separate_audio`` returns for the resolved path.
+    """
+    if file is None:
+        return None, None, "❌ Please upload a file."
+    # Handle both file object and file path
+    file_path = file.name if hasattr(file, 'name') else file
+    return separate_audio(model_name, file_path, prompt)
+def process_example(model_name, file_path, prompt):
+    """Process directly from example - file_path is already a string.
+
+    Called from the example buttons' ``.then(...)`` step with ``EXAMPLE_FILE``
+    and a fixed prompt. Returns ``(None, None, message)`` when ``file_path`` is
+    empty or does not exist on disk; otherwise returns the result of
+    ``separate_audio(model_name, file_path, prompt)``.
+    """
+    # Guard against a missing example before handing the path to the model.
+    if not file_path or not os.path.exists(file_path):
+        return None, None, "❌ Example file not found."
+    return separate_audio(model_name, file_path, prompt)
+# Build Gradio Interface
 with gr.Blocks(title="SAM-Audio Demo") as demo:
     gr.Markdown(
         """
         Isolate specific sounds from an audio or video file using natural language prompts.
+        **Models:** [facebook/sam-audio](https://huggingface.co/collections/facebook/sam-audio-67608edbf75ad66bf5e8cb3a)
         """
     )
     with gr.Row():
         with gr.Column():
+            model_selector = gr.Dropdown(
+                choices=list(MODELS.keys()),
+                value=DEFAULT_MODEL,
+                label="Model",
+                info="Larger = better quality but slower. TV variants for visual prompting."
+            )
+            input_file = gr.File(
+                label="Upload Audio or Video",
+                file_types=[".mp3", ".wav", ".flac", ".ogg", ".m4a", ".mp4", ".mkv", ".avi", ".mov", ".webm"],
+            )
             text_prompt = gr.Textbox(
                 label="Text Prompt",
             output_residual = gr.Audio(label="Background (Residual)")
     gr.Markdown("---")
+    gr.Markdown("### 🎬 Try Demo Examples")
+    gr.Markdown("Click an example below to auto-fill and process:")
+    with gr.Row():
+        if os.path.exists(EXAMPLE_FILE):
+            example_btn1 = gr.Button("🎤 Man Speaking")
+            example_btn2 = gr.Button("🎤 Woman Speaking")
+            example_btn3 = gr.Button("🎵 Background Music")
+    gr.Markdown("---")
+    gr.Markdown("**Supported formats:** MP3, WAV, FLAC, OGG, M4A, MP4, MKV, AVI, MOV, WebM")
+    # Main run button
     run_btn.click(
+        fn=process_file,
+        inputs=[model_selector, input_file, text_prompt],
         outputs=[output_target, output_residual, status_output]
     )
+    # Example buttons - auto-fill and process
+    if os.path.exists(EXAMPLE_FILE):
+        example_btn1.click(
+            fn=lambda: (EXAMPLE_FILE, "A man speaking"),
+            outputs=[input_file, text_prompt]
+        ).then(
+            fn=lambda m: process_example(m, EXAMPLE_FILE, "A man speaking"),
+            inputs=[model_selector],
+            outputs=[output_target, output_residual, status_output]
+        )
+        example_btn2.click(
+            fn=lambda: (EXAMPLE_FILE, "A woman speaking"),
+            outputs=[input_file, text_prompt]
+        ).then(
+            fn=lambda m: process_example(m, EXAMPLE_FILE, "A woman speaking"),
+            inputs=[model_selector],
+            outputs=[output_target, output_residual, status_output]
+        )
+        example_btn3.click(
+            fn=lambda: (EXAMPLE_FILE, "Background music"),
+            outputs=[input_file, text_prompt]
+        ).then(
+            fn=lambda m: process_example(m, EXAMPLE_FILE, "Background music"),
+            inputs=[model_selector],
+            outputs=[output_target, output_residual, status_output]
+        )
 if __name__ == "__main__":
     demo.launch()

examples/office.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c0f583ff34c5fd9d1a83d640e7c0131ad339755bd69e54f104723b707f213c21
+size 4551702