Spaces:

camanalo1
/

MyAlexa

Build error

App Files Files Community

camanalo1 committed on May 1, 2024

Commit

0f4f655

verified ·

1 Parent(s): da3b26a

Update app.py

Browse files

Files changed (1) hide show

app.py +49 -11

app.py CHANGED Viewed

@@ -1,12 +1,50 @@
 import gradio as gr
-from transformers import pipeline
-pipe = pipeline(task="image-classification",
-                # model that can do 22k-category classification
-                model="microsoft/beit-base-patch16-224-pt22k-ft22k")
-gr.Interface.from_pipeline(pipe,
-                           title="22k Image Classification",
-                           description="Object Recognition using Microsoft BEIT",
-                           examples = ['wonder_cat.jpg', 'aki_dog.jpg',],
-                           article = "Author: <a href=\"https://huggingface.co/rowel\">Rowel Atienza</a>",
-                           ).launch()

 import gradio as gr
+from transformers import pipeline, VitsTokenizer, VitsModel, set_seed
+import numpy as np
+import torch
+import io
+import soundfile as sf
+# Initialize ASR pipeline
+transcriber = pipeline("automatic-speech-recognition", model="facebook/s2t-small-librispeech-asr")
+# Initialize LLM pipeline
+generator = pipeline("text-generation", model="gpt2")
+# Initialize TTS tokenizer and model
+tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
+model = VitsModel.from_pretrained("facebook/mms-tts-eng")
+def transcribe_and_generate_audio(audio):
+    sr, y = audio
+    y = y.astype(np.float32)
+    y /= np.max(np.abs(y))
+    # Transcribe audio
+    asr_output = transcriber({"sampling_rate": sr, "raw": y})["text"]
+    # Generate text based on ASR output
+    generated_text = generator(asr_output, max_length=100, num_return_sequences=1)[0]['generated_text']
+    # Generate audio from text
+    inputs = tokenizer(text=generated_text, return_tensors="pt")
+    set_seed(555)
+    with torch.no_grad():
+        outputs = model(**inputs)
+    waveform = outputs.waveform[0]
+    waveform_path = "output.wav"
+    sf.write(waveform_path, waveform.numpy(), 22050, format='wav')
+    return waveform_path
+# Define Gradio interface
+audio_input = gr.Interface(
+    transcribe_and_generate_audio,
+    gr.Audio(sources=["microphone"], label="Speak Here"),
+    "audio",
+    title="ASR -> LLM -> TTS",
+    description="Speak into the microphone and hear the generated audio."
+)
+# Launch the interface
+audio_input.launch()