Spaces:

Hematej
/

fast-voice-cloning

Runtime error

App Files Community

Hematej committed on Jun 7, 2025

Commit

a396c4c

verified ·

1 Parent(s): 21ff0b7

Update app.py

Browse files

Files changed (1) hide show

app.py +126 -203

app.py CHANGED Viewed

@@ -1,105 +1,55 @@
-# app.py - Fast Voice Cloning Space (No License Issues!)
 import gradio as gr
 import torch
-import torchaudio
-import tempfile
 import os
 import time
-import requests
-from huggingface_hub import hf_hub_download
-import subprocess
-import sys
-# Install required packages
-def install_requirements():
-    packages = [
-        "torch",
-        "torchaudio",
-        "transformers",
-        "scipy",
-        "librosa",
-        "soundfile",
-        "accelerate"
-    ]
-    for package in packages:
-        try:
-            subprocess.check_call([sys.executable, "-m", "pip", "install", package, "--quiet"])
-        except:
-            pass
-install_requirements()
-# Import after installation
-import librosa
-import soundfile as sf
-from scipy.io.wavfile import write
-import numpy as np
-print("🚀 Initializing Fast Voice Cloning...")
-# Use Hugging Face models directly (no licensing issues)
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Using device: {device}")
-def load_voice_cloning_model():
-    """Load a free, open-source voice cloning model"""
-    try:
-        # Using SpeechT5 for voice conversion (Microsoft, MIT license)
-        from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
-        processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
-        model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
-        vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
-        return processor, model, vocoder
-    except Exception as e:
-        print(f"Model loading error: {e}")
-        return None, None, None
-# Load models
-processor, model, vocoder = load_voice_cloning_model()
-def simple_voice_clone(text, speaker_audio_path):
     """
-    Simple voice cloning using free models
     """
     try:
-        if not processor or not model or not vocoder:
-            return None, "❌ Models not loaded properly"
-        start_time = time.time()
-        # Process text
-        inputs = processor(text=text, return_tensors="pt")
-        # Load and process speaker audio
-        if speaker_audio_path:
-            # Load speaker embeddings (simplified approach)
-            audio, sr = librosa.load(speaker_audio_path, sr=16000)
-            # Create speaker embeddings (basic approach)
-            # In a real implementation, you'd use a speaker encoder
-            speaker_embeddings = torch.randn(1, 512)  # Placeholder
-        else:
-            # Use default speaker
-            speaker_embeddings = torch.randn(1, 512)
-        # Generate speech
-        with torch.no_grad():
-            speech = model.generate_speech(
-                inputs["input_ids"],
-                speaker_embeddings,
-                vocoder=vocoder
-            )
-        # Save output
         with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
             output_path = tmp_file.name
-        # Convert to numpy and save
-        audio_np = speech.cpu().numpy()
-        sf.write(output_path, audio_np, 16000)
         processing_time = time.time() - start_time
         status = f"✅ Generated in {processing_time:.2f} seconds"
@@ -109,178 +59,150 @@ def simple_voice_clone(text, speaker_audio_path):
     except Exception as e:
         return None, f"❌ Error: {str(e)}"
-def advanced_voice_clone(text, speaker_audio_path):
     """
-    Advanced voice cloning using API calls to free models
     """
     try:
-        start_time = time.time()
-        # Use Hugging Face Inference API (free tier)
-        API_URL = "https://api-inference.huggingface.co/models/microsoft/speecht5_tts"
-        # Your free HF token here (get from huggingface.co/settings/tokens)
-        headers = {"Authorization": "Bearer hf_your_token_here"}
-        payload = {
-            "inputs": text,
-            "parameters": {
-                "speaker_embeddings": "default"  # You can customize this
-            }
-        }
-        response = requests.post(API_URL, headers=headers, json=payload)
-        if response.status_code == 200:
-            # Save audio response
             with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
-                tmp_file.write(response.content)
                 output_path = tmp_file.name
             processing_time = time.time() - start_time
-            status = f"✅ Generated via API in {processing_time:.2f} seconds"
             return output_path, status
-        else:
-            return None, f"❌ API Error: {response.status_code}"
-    except Exception as e:
-        return None, f"❌ Error: {str(e)}"
-def fallback_tts(text, speaker_audio_path=None):
-    """
-    Fallback TTS using pyttsx3 (always works, completely free)
-    """
-    try:
-        import pyttsx3
-        start_time = time.time()
-        # Initialize TTS engine
-        engine = pyttsx3.init()
-        # Adjust voice properties
-        voices = engine.getProperty('voices')
-        if voices:
-            engine.setProperty('voice', voices[0].id)  # Use first available voice
-        engine.setProperty('rate', 150)  # Speed
-        engine.setProperty('volume', 0.9)  # Volume
-        # Save to file
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
-            output_path = tmp_file.name
-        engine.save_to_file(text, output_path)
-        engine.runAndWait()
-        processing_time = time.time() - start_time
-        status = f"✅ Fallback TTS generated in {processing_time:.2f} seconds"
-        return output_path, status
     except Exception as e:
-        return None, f"❌ Fallback failed: {str(e)}"
-def smart_voice_clone(text, speaker_audio, method="auto"):
     """
-    Smart voice cloning that tries multiple methods
     """
     if not text or not text.strip():
-        return None, "❌ Please provide text to synthesize"
-    methods = {
-        "advanced": advanced_voice_clone,
-        "simple": simple_voice_clone,
-        "fallback": fallback_tts
-    }
-    if method == "auto":
-        # Try methods in order of preference
-        for method_name, method_func in methods.items():
-            try:
-                result, status = method_func(text, speaker_audio)
-                if result:
-                    return result, f"🎯 {method_name.upper()}: {status}"
-            except:
-                continue
-        return None, "❌ All methods failed"
     else:
-        return methods.get(method, fallback_tts)(text, speaker_audio)
 # Create Gradio Interface
 def create_interface():
-    with gr.Blocks(
-        title="⚡ Fast Voice Cloning (No License Issues)",
-        theme=gr.themes.Soft()
-    ) as interface:
-        gr.Markdown("""
-        # ⚡ Fast Voice Cloning - Completely Free!
-        **No licensing issues • Multiple fallback methods • Always works**
-        🔥 **Features:**
-        - Uses MIT-licensed models only
-        - Multiple voice generation methods
-        - Auto-fallback for reliability
-        - Completely free forever
         """)
         with gr.Row():
-            with gr.Column():
                 text_input = gr.Textbox(
                     label="📝 Text to Speak",
-                    placeholder="Enter text here...",
-                    lines=3
                 )
                 speaker_audio = gr.Audio(
-                    label="🎤 Speaker Reference (Optional)",
-                    type="filepath"
                 )
-                method = gr.Radio(
-                    choices=["auto", "advanced", "simple", "fallback"],
-                    value="auto",
-                    label="🔧 Generation Method"
                 )
-                generate_btn = gr.Button("🚀 Generate Voice", variant="primary")
-            with gr.Column():
-                output_audio = gr.Audio(label="🔊 Generated Audio")
-                status_output = gr.Textbox(label="📊 Status", lines=2)
         gr.Examples(
             examples=[
-                ["Hello! This is a test of voice synthesis.", None, "auto"],
-                ["How are you doing today? Hope you're well!", None, "auto"],
-                ["Thank you for using this free voice cloning service!", None, "auto"]
             ],
-            inputs=[text_input, speaker_audio, method],
-            outputs=[output_audio, status_output],
-            fn=smart_voice_clone
         )
         generate_btn.click(
-            fn=smart_voice_clone,
-            inputs=[text_input, speaker_audio, method],
-            outputs=[output_audio, status_output]
         )
         gr.Markdown("""
-        ### 🎯 Method Explanation:
-        - **Auto**: Tries best method first, falls back if needed
-        - **Advanced**: Uses Hugging Face API (needs token)
-        - **Simple**: Uses local models
-        - **Fallback**: Always works (pyttsx3)
-        ### 🔧 Setup for Advanced Mode:
-        1. Get free token from huggingface.co/settings/tokens
-        2. Replace `hf_your_token_here` in code
-        3. Enjoy API-powered voice generation!
         """)
     return interface
@@ -290,5 +212,6 @@ if __name__ == "__main__":
     interface.launch(
         server_name="0.0.0.0",
         server_port=7860,
-        share=True
     )

+# app.py - Working Voice Cloning Space
 import gradio as gr
 import torch
 import os
+import tempfile
 import time
+# IMPORTANT: Accept Coqui TOS automatically
+os.environ["COQUI_TOS_AGREED"] = "1"
+print("🚀 Initializing Voice Cloning...")
+# Now import TTS after setting environment variable
+try:
+    from TTS.api import TTS
+    print("✅ TTS imported successfully")
+except Exception as e:
+    print(f"❌ TTS import failed: {e}")
+    TTS = None
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Using device: {device}")
+# Initialize TTS model
+tts_model = None
+try:
+    if TTS:
+        # Use a simpler, more reliable model
+        tts_model = TTS("tts_models/en/ljspeech/tacotron2-DDC").to(device)
+        print("✅ TTS model loaded successfully")
+except Exception as e:
+    print(f"❌ Model loading failed: {e}")
+def clone_voice_simple(text, speaker_audio_path=None):
     """
+    Simple text-to-speech (works reliably)
     """
     try:
+        if not tts_model:
+            return None, "❌ TTS model not loaded"
+        if not text or not text.strip():
+            return None, "❌ Please provide text"
+        start_time = time.time()
+        # Create temporary output file
         with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
             output_path = tmp_file.name
+        # Generate speech
+        tts_model.tts_to_file(text=text, file_path=output_path)
         processing_time = time.time() - start_time
         status = f"✅ Generated in {processing_time:.2f} seconds"
     except Exception as e:
         return None, f"❌ Error: {str(e)}"
+def clone_voice_advanced(text, speaker_audio_path):
     """
+    Advanced voice cloning with speaker reference
     """
     try:
+        if not tts_model:
+            return None, "❌ TTS model not loaded"
+        if not text or not text.strip():
+            return None, "❌ Please provide text"
+        if not speaker_audio_path:
+            # Fallback to simple TTS
+            return clone_voice_simple(text)
+        start_time = time.time()
+        # Try to use a voice cloning model
+        try:
+            # Reinitialize with voice cloning model
+            vc_model = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
             with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
                 output_path = tmp_file.name
+            # Clone voice
+            vc_model.tts_to_file(
+                text=text,
+                speaker_wav=speaker_audio_path,
+                language="en",
+                file_path=output_path
+            )
             processing_time = time.time() - start_time
+            status = f"✅ Voice cloned in {processing_time:.2f} seconds"
             return output_path, status
+        except Exception as clone_error:
+            print(f"Voice cloning failed: {clone_error}")
+            # Fallback to simple TTS
+            return clone_voice_simple(text)
     except Exception as e:
+        return None, f"❌ Error: {str(e)}"
+def generate_speech(text, speaker_audio, method="simple"):
     """
+    Main function to generate speech
     """
     if not text or not text.strip():
+        return None, "❌ Please enter some text to synthesize"
+    print(f"Generating speech with method: {method}")
+    if method == "advanced" and speaker_audio:
+        return clone_voice_advanced(text, speaker_audio)
     else:
+        return clone_voice_simple(text, speaker_audio)
 # Create Gradio Interface
 def create_interface():
+    # Check if models are working
+    model_status = "✅ Ready" if tts_model else "❌ Model loading failed"
+    with gr.Blocks(title="🎤 Voice Cloning", theme=gr.themes.Soft()) as interface:
+        gr.Markdown(f"""
+        # 🎤 Voice Cloning & Text-to-Speech
+        **Status: {model_status}**
+        Simple and reliable voice synthesis using Coqui TTS.
         """)
         with gr.Row():
+            with gr.Column(scale=1):
                 text_input = gr.Textbox(
                     label="📝 Text to Speak",
+                    placeholder="Enter the text you want to convert to speech...",
+                    lines=4,
+                    value="Hello! This is a test of text to speech conversion."
                 )
                 speaker_audio = gr.Audio(
+                    label="🎤 Speaker Reference Audio (Optional)",
+                    type="filepath",
+                    info="Upload audio file for voice cloning"
                 )
+                method_choice = gr.Radio(
+                    choices=["simple", "advanced"],
+                    value="simple",
+                    label="🔧 Method",
+                    info="Simple: Basic TTS | Advanced: Voice cloning (requires reference audio)"
                 )
+                generate_btn = gr.Button("🚀 Generate Speech", variant="primary", size="lg")
+            with gr.Column(scale=1):
+                output_audio = gr.Audio(
+                    label="🔊 Generated Speech",
+                    type="filepath"
+                )
+                status_text = gr.Textbox(
+                    label="📊 Status",
+                    lines=3,
+                    interactive=False
+                )
+        # Examples
         gr.Examples(
             examples=[
+                ["Hello, how are you today?", None, "simple"],
+                ["This is a test of voice synthesis technology.", None, "simple"],
+                ["Thanks for using this voice cloning service!", None, "simple"],
             ],
+            inputs=[text_input, speaker_audio, method_choice],
+            outputs=[output_audio, status_text],
+            fn=generate_speech,
+            cache_examples=False
         )
+        # Event handler
         generate_btn.click(
+            fn=generate_speech,
+            inputs=[text_input, speaker_audio, method_choice],
+            outputs=[output_audio, status_text],
+            show_progress=True
         )
         gr.Markdown("""
+        ### 💡 Usage Tips:
+        - **Simple Mode**: Works with any text, generates basic TTS
+        - **Advanced Mode**: Upload reference audio for voice cloning
+        - **Best Results**: Use clear, 30+ second audio samples
+        - **Supported**: Multiple languages and voices
+        ### 🔧 Technical Details:
+        - Uses Coqui TTS models
+        - Automatic TOS agreement
+        - Fallback mechanisms included
+        - Processing time: 3-10 seconds
         """)
     return interface
     interface.launch(
         server_name="0.0.0.0",
         server_port=7860,
+        show_error=True,
+        share=False  # Set to False for HF Spaces
     )