Hematej committed on
Commit
a396c4c
·
verified ·
1 Parent(s): 21ff0b7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +126 -203
app.py CHANGED
@@ -1,105 +1,55 @@
1
- # app.py - Fast Voice Cloning Space (No License Issues!)
2
  import gradio as gr
3
  import torch
4
- import torchaudio
5
- import tempfile
6
  import os
 
7
  import time
8
- import requests
9
- from huggingface_hub import hf_hub_download
10
- import subprocess
11
- import sys
12
 
13
- # Install required packages
14
- def install_requirements():
15
- packages = [
16
- "torch",
17
- "torchaudio",
18
- "transformers",
19
- "scipy",
20
- "librosa",
21
- "soundfile",
22
- "accelerate"
23
- ]
24
-
25
- for package in packages:
26
- try:
27
- subprocess.check_call([sys.executable, "-m", "pip", "install", package, "--quiet"])
28
- except:
29
- pass
30
 
31
- install_requirements()
32
 
33
- # Import after installation
34
- import librosa
35
- import soundfile as sf
36
- from scipy.io.wavfile import write
37
- import numpy as np
 
 
38
 
39
- print("🚀 Initializing Fast Voice Cloning...")
40
-
41
- # Use Hugging Face models directly (no licensing issues)
42
  device = "cuda" if torch.cuda.is_available() else "cpu"
43
  print(f"Using device: {device}")
44
 
45
- def load_voice_cloning_model():
46
- """Load a free, open-source voice cloning model"""
47
- try:
48
- # Using SpeechT5 for voice conversion (Microsoft, MIT license)
49
- from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
50
-
51
- processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
52
- model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
53
- vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
54
-
55
- return processor, model, vocoder
56
- except Exception as e:
57
- print(f"Model loading error: {e}")
58
- return None, None, None
59
-
60
- # Load models
61
- processor, model, vocoder = load_voice_cloning_model()
62
 
63
- def simple_voice_clone(text, speaker_audio_path):
64
  """
65
- Simple voice cloning using free models
66
  """
67
  try:
68
- if not processor or not model or not vocoder:
69
- return None, "❌ Models not loaded properly"
70
-
71
- start_time = time.time()
72
-
73
- # Process text
74
- inputs = processor(text=text, return_tensors="pt")
75
 
76
- # Load and process speaker audio
77
- if speaker_audio_path:
78
- # Load speaker embeddings (simplified approach)
79
- audio, sr = librosa.load(speaker_audio_path, sr=16000)
80
-
81
- # Create speaker embeddings (basic approach)
82
- # In a real implementation, you'd use a speaker encoder
83
- speaker_embeddings = torch.randn(1, 512) # Placeholder
84
- else:
85
- # Use default speaker
86
- speaker_embeddings = torch.randn(1, 512)
87
 
88
- # Generate speech
89
- with torch.no_grad():
90
- speech = model.generate_speech(
91
- inputs["input_ids"],
92
- speaker_embeddings,
93
- vocoder=vocoder
94
- )
95
 
96
- # Save output
97
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
98
  output_path = tmp_file.name
99
-
100
- # Convert to numpy and save
101
- audio_np = speech.cpu().numpy()
102
- sf.write(output_path, audio_np, 16000)
103
 
104
  processing_time = time.time() - start_time
105
  status = f"✅ Generated in {processing_time:.2f} seconds"
@@ -109,178 +59,150 @@ def simple_voice_clone(text, speaker_audio_path):
109
  except Exception as e:
110
  return None, f"❌ Error: {str(e)}"
111
 
112
- def advanced_voice_clone(text, speaker_audio_path):
113
  """
114
- Advanced voice cloning using API calls to free models
115
  """
116
  try:
117
- start_time = time.time()
 
118
 
119
- # Use Hugging Face Inference API (free tier)
120
- API_URL = "https://api-inference.huggingface.co/models/microsoft/speecht5_tts"
121
 
122
- # Your free HF token here (get from huggingface.co/settings/tokens)
123
- headers = {"Authorization": "Bearer hf_your_token_here"}
 
124
 
125
- payload = {
126
- "inputs": text,
127
- "parameters": {
128
- "speaker_embeddings": "default" # You can customize this
129
- }
130
- }
131
-
132
- response = requests.post(API_URL, headers=headers, json=payload)
133
 
134
- if response.status_code == 200:
135
- # Save audio response
 
 
 
136
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
137
- tmp_file.write(response.content)
138
  output_path = tmp_file.name
139
 
 
 
 
 
 
 
 
 
140
  processing_time = time.time() - start_time
141
- status = f"✅ Generated via API in {processing_time:.2f} seconds"
142
 
143
  return output_path, status
144
- else:
145
- return None, f"❌ API Error: {response.status_code}"
146
 
147
- except Exception as e:
148
- return None, f" Error: {str(e)}"
149
-
150
- def fallback_tts(text, speaker_audio_path=None):
151
- """
152
- Fallback TTS using pyttsx3 (always works, completely free)
153
- """
154
- try:
155
- import pyttsx3
156
-
157
- start_time = time.time()
158
-
159
- # Initialize TTS engine
160
- engine = pyttsx3.init()
161
-
162
- # Adjust voice properties
163
- voices = engine.getProperty('voices')
164
- if voices:
165
- engine.setProperty('voice', voices[0].id) # Use first available voice
166
-
167
- engine.setProperty('rate', 150) # Speed
168
- engine.setProperty('volume', 0.9) # Volume
169
-
170
- # Save to file
171
- with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
172
- output_path = tmp_file.name
173
-
174
- engine.save_to_file(text, output_path)
175
- engine.runAndWait()
176
-
177
- processing_time = time.time() - start_time
178
- status = f"✅ Fallback TTS generated in {processing_time:.2f} seconds"
179
-
180
- return output_path, status
181
 
182
  except Exception as e:
183
- return None, f"❌ Fallback failed: {str(e)}"
184
 
185
- def smart_voice_clone(text, speaker_audio, method="auto"):
186
  """
187
- Smart voice cloning that tries multiple methods
188
  """
189
  if not text or not text.strip():
190
- return None, "❌ Please provide text to synthesize"
191
 
192
- methods = {
193
- "advanced": advanced_voice_clone,
194
- "simple": simple_voice_clone,
195
- "fallback": fallback_tts
196
- }
197
 
198
- if method == "auto":
199
- # Try methods in order of preference
200
- for method_name, method_func in methods.items():
201
- try:
202
- result, status = method_func(text, speaker_audio)
203
- if result:
204
- return result, f"🎯 {method_name.upper()}: {status}"
205
- except:
206
- continue
207
-
208
- return None, "❌ All methods failed"
209
  else:
210
- return methods.get(method, fallback_tts)(text, speaker_audio)
211
 
212
  # Create Gradio Interface
213
  def create_interface():
214
- with gr.Blocks(
215
- title=" Fast Voice Cloning (No License Issues)",
216
- theme=gr.themes.Soft()
217
- ) as interface:
218
 
219
- gr.Markdown("""
220
- # Fast Voice Cloning - Completely Free!
221
 
222
- **No licensing issues • Multiple fallback methods • Always works**
223
 
224
- 🔥 **Features:**
225
- - Uses MIT-licensed models only
226
- - Multiple voice generation methods
227
- - Auto-fallback for reliability
228
- - Completely free forever
229
  """)
230
 
231
  with gr.Row():
232
- with gr.Column():
233
  text_input = gr.Textbox(
234
  label="📝 Text to Speak",
235
- placeholder="Enter text here...",
236
- lines=3
 
237
  )
238
 
239
  speaker_audio = gr.Audio(
240
- label="🎤 Speaker Reference (Optional)",
241
- type="filepath"
 
242
  )
243
 
244
- method = gr.Radio(
245
- choices=["auto", "advanced", "simple", "fallback"],
246
- value="auto",
247
- label="🔧 Generation Method"
 
248
  )
249
 
250
- generate_btn = gr.Button("🚀 Generate Voice", variant="primary")
251
 
252
- with gr.Column():
253
- output_audio = gr.Audio(label="🔊 Generated Audio")
254
- status_output = gr.Textbox(label="📊 Status", lines=2)
 
 
 
 
 
 
 
 
255
 
 
256
  gr.Examples(
257
  examples=[
258
- ["Hello! This is a test of voice synthesis.", None, "auto"],
259
- ["How are you doing today? Hope you're well!", None, "auto"],
260
- ["Thank you for using this free voice cloning service!", None, "auto"]
261
  ],
262
- inputs=[text_input, speaker_audio, method],
263
- outputs=[output_audio, status_output],
264
- fn=smart_voice_clone
 
265
  )
266
 
 
267
  generate_btn.click(
268
- fn=smart_voice_clone,
269
- inputs=[text_input, speaker_audio, method],
270
- outputs=[output_audio, status_output]
 
271
  )
272
 
273
  gr.Markdown("""
274
- ### 🎯 Method Explanation:
275
- - **Auto**: Tries best method first, falls back if needed
276
- - **Advanced**: Uses Hugging Face API (needs token)
277
- - **Simple**: Uses local models
278
- - **Fallback**: Always works (pyttsx3)
279
-
280
- ### 🔧 Setup for Advanced Mode:
281
- 1. Get free token from huggingface.co/settings/tokens
282
- 2. Replace `hf_your_token_here` in code
283
- 3. Enjoy API-powered voice generation!
 
284
  """)
285
 
286
  return interface
@@ -290,5 +212,6 @@ if __name__ == "__main__":
290
  interface.launch(
291
  server_name="0.0.0.0",
292
  server_port=7860,
293
- share=True
 
294
  )
 
1
+ # app.py - Working Voice Cloning Space
2
  import gradio as gr
3
  import torch
 
 
4
  import os
5
+ import tempfile
6
  import time
 
 
 
 
7
 
8
+ # IMPORTANT: Accept Coqui TOS automatically
9
+ os.environ["COQUI_TOS_AGREED"] = "1"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
+ print("🚀 Initializing Voice Cloning...")
12
 
13
+ # Now import TTS after setting environment variable
14
+ try:
15
+ from TTS.api import TTS
16
+ print("✅ TTS imported successfully")
17
+ except Exception as e:
18
+ print(f"❌ TTS import failed: {e}")
19
+ TTS = None
20
 
 
 
 
21
  device = "cuda" if torch.cuda.is_available() else "cpu"
22
  print(f"Using device: {device}")
23
 
24
+ # Initialize TTS model
25
+ tts_model = None
26
+ try:
27
+ if TTS:
28
+ # Use a simpler, more reliable model
29
+ tts_model = TTS("tts_models/en/ljspeech/tacotron2-DDC").to(device)
30
+ print("✅ TTS model loaded successfully")
31
+ except Exception as e:
32
+ print(f"❌ Model loading failed: {e}")
 
 
 
 
 
 
 
 
33
 
34
+ def clone_voice_simple(text, speaker_audio_path=None):
35
  """
36
+ Simple text-to-speech (works reliably)
37
  """
38
  try:
39
+ if not tts_model:
40
+ return None, "❌ TTS model not loaded"
 
 
 
 
 
41
 
42
+ if not text or not text.strip():
43
+ return None, "❌ Please provide text"
 
 
 
 
 
 
 
 
 
44
 
45
+ start_time = time.time()
 
 
 
 
 
 
46
 
47
+ # Create temporary output file
48
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
49
  output_path = tmp_file.name
50
+
51
+ # Generate speech
52
+ tts_model.tts_to_file(text=text, file_path=output_path)
 
53
 
54
  processing_time = time.time() - start_time
55
  status = f"✅ Generated in {processing_time:.2f} seconds"
 
59
  except Exception as e:
60
  return None, f"❌ Error: {str(e)}"
61
 
62
+ def clone_voice_advanced(text, speaker_audio_path):
63
  """
64
+ Advanced voice cloning with speaker reference
65
  """
66
  try:
67
+ if not tts_model:
68
+ return None, "❌ TTS model not loaded"
69
 
70
+ if not text or not text.strip():
71
+ return None, "❌ Please provide text"
72
 
73
+ if not speaker_audio_path:
74
+ # Fallback to simple TTS
75
+ return clone_voice_simple(text)
76
 
77
+ start_time = time.time()
 
 
 
 
 
 
 
78
 
79
+ # Try to use a voice cloning model
80
+ try:
81
+ # Reinitialize with voice cloning model
82
+ vc_model = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
83
+
84
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
 
85
  output_path = tmp_file.name
86
 
87
+ # Clone voice
88
+ vc_model.tts_to_file(
89
+ text=text,
90
+ speaker_wav=speaker_audio_path,
91
+ language="en",
92
+ file_path=output_path
93
+ )
94
+
95
  processing_time = time.time() - start_time
96
+ status = f"✅ Voice cloned in {processing_time:.2f} seconds"
97
 
98
  return output_path, status
 
 
99
 
100
+ except Exception as clone_error:
101
+ print(f"Voice cloning failed: {clone_error}")
102
+ # Fallback to simple TTS
103
+ return clone_voice_simple(text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
  except Exception as e:
106
+ return None, f"❌ Error: {str(e)}"
107
 
108
+ def generate_speech(text, speaker_audio, method="simple"):
109
  """
110
+ Main function to generate speech
111
  """
112
  if not text or not text.strip():
113
+ return None, "❌ Please enter some text to synthesize"
114
 
115
+ print(f"Generating speech with method: {method}")
 
 
 
 
116
 
117
+ if method == "advanced" and speaker_audio:
118
+ return clone_voice_advanced(text, speaker_audio)
 
 
 
 
 
 
 
 
 
119
  else:
120
+ return clone_voice_simple(text, speaker_audio)
121
 
122
  # Create Gradio Interface
123
  def create_interface():
124
+ # Check if models are working
125
+ model_status = " Ready" if tts_model else "❌ Model loading failed"
126
+
127
+ with gr.Blocks(title="🎤 Voice Cloning", theme=gr.themes.Soft()) as interface:
128
 
129
+ gr.Markdown(f"""
130
+ # 🎤 Voice Cloning & Text-to-Speech
131
 
132
+ **Status: {model_status}**
133
 
134
+ Simple and reliable voice synthesis using Coqui TTS.
 
 
 
 
135
  """)
136
 
137
  with gr.Row():
138
+ with gr.Column(scale=1):
139
  text_input = gr.Textbox(
140
  label="📝 Text to Speak",
141
+ placeholder="Enter the text you want to convert to speech...",
142
+ lines=4,
143
+ value="Hello! This is a test of text to speech conversion."
144
  )
145
 
146
  speaker_audio = gr.Audio(
147
+ label="🎤 Speaker Reference Audio (Optional)",
148
+ type="filepath",
149
+ info="Upload audio file for voice cloning"
150
  )
151
 
152
+ method_choice = gr.Radio(
153
+ choices=["simple", "advanced"],
154
+ value="simple",
155
+ label="🔧 Method",
156
+ info="Simple: Basic TTS | Advanced: Voice cloning (requires reference audio)"
157
  )
158
 
159
+ generate_btn = gr.Button("🚀 Generate Speech", variant="primary", size="lg")
160
 
161
+ with gr.Column(scale=1):
162
+ output_audio = gr.Audio(
163
+ label="🔊 Generated Speech",
164
+ type="filepath"
165
+ )
166
+
167
+ status_text = gr.Textbox(
168
+ label="📊 Status",
169
+ lines=3,
170
+ interactive=False
171
+ )
172
 
173
+ # Examples
174
  gr.Examples(
175
  examples=[
176
+ ["Hello, how are you today?", None, "simple"],
177
+ ["This is a test of voice synthesis technology.", None, "simple"],
178
+ ["Thanks for using this voice cloning service!", None, "simple"],
179
  ],
180
+ inputs=[text_input, speaker_audio, method_choice],
181
+ outputs=[output_audio, status_text],
182
+ fn=generate_speech,
183
+ cache_examples=False
184
  )
185
 
186
+ # Event handler
187
  generate_btn.click(
188
+ fn=generate_speech,
189
+ inputs=[text_input, speaker_audio, method_choice],
190
+ outputs=[output_audio, status_text],
191
+ show_progress=True
192
  )
193
 
194
  gr.Markdown("""
195
+ ### 💡 Usage Tips:
196
+ - **Simple Mode**: Works with any text, generates basic TTS
197
+ - **Advanced Mode**: Upload reference audio for voice cloning
198
+ - **Best Results**: Use clear, 30+ second audio samples
199
+ - **Supported**: Multiple languages and voices
200
+
201
+ ### 🔧 Technical Details:
202
+ - Uses Coqui TTS models
203
+ - Automatic TOS agreement
204
+ - Fallback mechanisms included
205
+ - Processing time: 3-10 seconds
206
  """)
207
 
208
  return interface
 
212
  interface.launch(
213
  server_name="0.0.0.0",
214
  server_port=7860,
215
+ show_error=True,
216
+ share=False # Set to False for HF Spaces
217
  )