Spaces:

Anupam007
/

google-meet-transcriber

Runtime error

App Files Files Community

Anupam007 committed on Mar 27

Commit

b044e07

verified ·

1 Parent(s): 5500968

Update app.py

Browse files

Files changed (1) hide show

app.py +57 -64

app.py CHANGED Viewed

@@ -4,107 +4,100 @@ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
 import time
 import numpy as np
 import soundfile as sf
 # --- Configuration ---
 # Use the first CUDA GPU with half precision when one is present; otherwise
 # stay on the CPU in full precision.
 if torch.cuda.is_available():
     device, torch_dtype = "cuda:0", torch.float16
 else:
     device, torch_dtype = "cpu", torch.float32
 print(f"Using device: {device}")

 # STT Model: speech-to-text checkpoint wrapped in a chunked ASR pipeline.
 stt_model_id = "openai/whisper-tiny"
 stt_model = AutoModelForSpeechSeq2Seq.from_pretrained(
     stt_model_id,
     torch_dtype=torch_dtype,
     low_cpu_mem_usage=True,
     use_safetensors=True,
 ).to(device)
 processor = AutoProcessor.from_pretrained(stt_model_id)
 stt_pipeline = pipeline(
     "automatic-speech-recognition",
     model=stt_model,
     tokenizer=processor.tokenizer,
     feature_extractor=processor.feature_extractor,
     max_new_tokens=128,
     chunk_length_s=30,
     batch_size=16,
     torch_dtype=torch_dtype,
     device=device,
 )

 # Summarization Model
 summarizer_model_id = "sshleifer/distilbart-cnn-6-6"
 summarizer = pipeline("summarization", model=summarizer_model_id, device=device)

 # Compared against time.time() differences to decide when to re-summarize.
 SUMMARY_INTERVAL = 30.0
 def format_summary_as_bullets(summary_text):
     """Format a summary as a bullet list, one sentence per bullet.

     A sentence boundary is a period followed by a space; newlines already
     present in the text are also treated as boundaries.

     Args:
         summary_text: Summary string; may be empty or None.

     Returns:
         One "- "-prefixed line per non-empty sentence, joined with newlines,
         or "" when there is nothing to format.
     """
     if not summary_text:
         return ""
     # Trimming each piece and dropping empty ones keeps leading/trailing
     # whitespace from producing "-  text" or a dangling "-" bullet.
     pieces = summary_text.replace(". ", ".\n").split("\n")
     return "\n".join(f"- {piece.strip()}" for piece in pieces if piece.strip())
def process_audio_stream(new_chunk_tuple, transcript_state, last_summary_time, summary_state):
    """Transcribe one streamed audio chunk and periodically refresh the summary.

    Args:
        new_chunk_tuple: ``(sample_rate, samples)`` pair from the streaming
            audio component, or None when no audio arrived.
        transcript_state: Transcript accumulated so far.
        last_summary_time: ``time.time()`` value recorded at the last
            successful summary.
        summary_state: Most recent bullet summary.

    Returns:
        A 5-tuple ``(transcript, summary, transcript, last_summary_time,
        summary)``: two display values followed by the three updated states.
    """
    unchanged = (transcript_state, summary_state, transcript_state, last_summary_time, summary_state)
    if new_chunk_tuple is None:
        return unchanged
    sample_rate, audio_chunk = new_chunk_tuple
    if audio_chunk is None or audio_chunk.size == 0:
        return unchanged

    # Integer PCM is scaled by its own dtype's full-scale value (32768 for
    # int16); other float widths are only cast, not rescaled.
    if np.issubdtype(audio_chunk.dtype, np.integer):
        full_scale = float(np.iinfo(audio_chunk.dtype).max) + 1.0
        audio_chunk = audio_chunk.astype(np.float32) / full_scale
    elif audio_chunk.dtype != np.float32:
        audio_chunk = audio_chunk.astype(np.float32)
    # The ASR pipeline is fed a 1-D signal, so fold channels down to mono.
    # NOTE(review): assumes multi-channel chunks are (samples, channels) -
    # confirm against the Gradio Audio documentation.
    if audio_chunk.ndim > 1:
        audio_chunk = audio_chunk.mean(axis=1)

    try:
        result = stt_pipeline({"sampling_rate": sample_rate, "raw": audio_chunk.copy()})
        new_text = result["text"].strip() if result["text"] else ""
    except Exception as e:
        # Best-effort: surface the failure inline instead of stopping the stream.
        new_text = f"[Transcription Error: {e}]"

    # Only append when something was recognised, so silent chunks do not pad
    # the transcript with trailing spaces.
    if not new_text:
        updated_transcript = transcript_state
    elif transcript_state:
        updated_transcript = transcript_state + " " + new_text
    else:
        updated_transcript = new_text

    current_time = time.time()
    new_summary = summary_state
    updated_last_summary_time = last_summary_time
    summary_due = current_time - last_summary_time > SUMMARY_INTERVAL
    if summary_due and len(updated_transcript) > 50:
        try:
            # truncation=True clips over-long input to the model's limit so a
            # long meeting does not make every later summary attempt fail;
            # only the start of a very long transcript is then summarized.
            summary_result = summarizer(
                updated_transcript,
                max_length=150,
                min_length=30,
                do_sample=False,
                truncation=True,
            )
            if summary_result and isinstance(summary_result, list):
                raw_summary = summary_result[0]['summary_text']
                new_summary = format_summary_as_bullets(raw_summary)
                updated_last_summary_time = current_time
        except Exception:
            # Show the error above the previous summary but keep the stored
            # summary and timestamp untouched so the next chunk retries.
            return updated_transcript, f"[Summarization Error]\n\n{summary_state}", updated_transcript, last_summary_time, summary_state
    return updated_transcript, new_summary, updated_transcript, updated_last_summary_time, new_summary
# --- Gradio Interface ---
with gr.Blocks() as demo:
    gr.Markdown("# Real-Time Meeting Notes with Google Meet")
    gr.Markdown("Click the button below to start a Google Meet session.")
    google_meet_button = gr.Markdown("### [Start Google Meet](https://meet.google.com/new){target=_blank}")
    # Values carried from one streaming callback to the next.
    transcript_state = gr.State("")
    last_summary_time = gr.State(0.0)
    summary_state = gr.State("")
    with gr.Row():
        with gr.Column(scale=1):
            audio_stream = gr.Audio(sources=["microphone"], streaming=True, label="Live Microphone Input", type="numpy")
        with gr.Column(scale=2):
            transcription_output = gr.Textbox(label="Full Transcription", lines=15, interactive=False)
            summary_output = gr.Textbox(label=f"Bullet Point Summary (Updates ~every {SUMMARY_INTERVAL}s)", lines=10, interactive=False)
    # Each chunk goes through process_audio_stream, whose 5-tuple result maps
    # onto the two textboxes followed by the three states.
    audio_stream.stream(
        fn=process_audio_stream,
        inputs=[audio_stream, transcript_state, last_summary_time, summary_state],
        outputs=[transcription_output, summary_output, transcript_state, last_summary_time, summary_state],
    )
    clear_button = gr.Button("Clear Transcript & Summary")
    # One reset value per output, in order: both textboxes, transcript state,
    # last-summary timestamp, summary state (five values for five outputs).
    clear_button.click(
        fn=lambda: ("", "", "", 0.0, ""),
        inputs=[],
        outputs=[transcription_output, summary_output, transcript_state, last_summary_time, summary_state],
    )
demo.queue()
demo.launch(debug=True, share=True)

 import time
 import numpy as np
 import soundfile as sf
+import librosa
 # --- Configuration ---
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
 print(f"Using device: {device}")
 stt_model_id = "openai/whisper-tiny"
 summarizer_model_id = "sshleifer/distilbart-cnn-6-6"
 # Compared against time.time() differences to decide when to re-summarize.
 SUMMARY_INTERVAL = 30.0
 # --- Load Models ---
 print("Loading STT model...")
 # Load the weights in the same dtype the pipeline below is configured with.
 # NOTE(review): without this the model stays in default precision while the
 # pipeline is told float16 on CUDA - confirm how the installed transformers
 # version treats that mismatch.
 stt_model = AutoModelForSpeechSeq2Seq.from_pretrained(stt_model_id, torch_dtype=torch_dtype).to(device)
 processor = AutoProcessor.from_pretrained(stt_model_id)
 stt_pipeline = pipeline(
     "automatic-speech-recognition",
     model=stt_model,
     tokenizer=processor.tokenizer,
     feature_extractor=processor.feature_extractor,
     max_new_tokens=128,
     chunk_length_s=30,
     batch_size=16,
     torch_dtype=torch_dtype,
     device=device,
 )
 print("Loading Summarization model...")
 summarizer = pipeline("summarization", model=summarizer_model_id, device=device)
 def format_summary_as_bullets(summary_text):
     """Format a summary as a bullet list, one sentence per bullet.

     A sentence boundary is a period followed by a space; newlines already
     present in the text are also treated as boundaries.

     Args:
         summary_text: Summary string; may be empty or None.

     Returns:
         One "- "-prefixed line per non-empty sentence, joined with newlines,
         or "" when there is nothing to format.
     """
     if not summary_text:
         return ""
     # Trimming each piece and dropping empty ones keeps leading/trailing
     # whitespace from producing "-  text" or a dangling "-" bullet.
     pieces = summary_text.replace(". ", ".\n").split("\n")
     return "\n".join(f"- {piece.strip()}" for piece in pieces if piece.strip())
def process_audio_stream(new_chunk_tuple, accumulated_transcript_state, last_summary_time_state, current_summary_state):
    """Transcribe one streamed audio chunk and periodically refresh the summary.

    Args:
        new_chunk_tuple: ``(sample_rate, samples)`` pair from the streaming
            audio component, or None when no audio arrived.
        accumulated_transcript_state: Transcript accumulated so far.
        last_summary_time_state: ``time.time()`` value recorded at the last
            successful summary.
        current_summary_state: Most recent bullet summary.

    Returns:
        A 5-tuple ``(transcript, summary, transcript, last_summary_time,
        summary)``: two display values followed by the three updated states.
    """
    unchanged = (
        accumulated_transcript_state,
        current_summary_state,
        accumulated_transcript_state,
        last_summary_time_state,
        current_summary_state,
    )
    if new_chunk_tuple is None:
        return unchanged
    sample_rate, audio_chunk = new_chunk_tuple
    if audio_chunk is None or sample_rate is None or audio_chunk.size == 0:
        return unchanged

    # Integer PCM is scaled by its own dtype's full-scale value (32768 for
    # int16); other float widths are only cast, not rescaled.
    if np.issubdtype(audio_chunk.dtype, np.integer):
        full_scale = float(np.iinfo(audio_chunk.dtype).max) + 1.0
        audio_chunk = audio_chunk.astype(np.float32) / full_scale
    elif audio_chunk.dtype != np.float32:
        audio_chunk = audio_chunk.astype(np.float32)
    # The ASR pipeline is fed a 1-D signal, so fold channels down to mono.
    # NOTE(review): assumes multi-channel chunks are (samples, channels) -
    # confirm against the Gradio Audio documentation.
    if audio_chunk.ndim > 1:
        audio_chunk = audio_chunk.mean(axis=1)

    # Speech-to-text processing
    try:
        result = stt_pipeline({"sampling_rate": sample_rate, "raw": audio_chunk.copy()})
        new_text = result["text"].strip() if result["text"] else ""
    except Exception as e:
        # Best-effort: surface the failure inline instead of stopping the stream.
        new_text = f"[Transcription Error: {e}]"

    # Only append when something was recognised, so silent chunks do not pad
    # the transcript with trailing spaces.
    if not new_text:
        updated_transcript = accumulated_transcript_state
    elif accumulated_transcript_state:
        updated_transcript = accumulated_transcript_state + " " + new_text
    else:
        updated_transcript = new_text

    # Summarization every SUMMARY_INTERVAL
    current_time = time.time()
    new_summary = current_summary_state
    summary_due = current_time - last_summary_time_state > SUMMARY_INTERVAL
    if summary_due and len(updated_transcript) > 50:
        try:
            # truncation=True clips over-long input to the model's limit so a
            # long meeting does not make every later summary attempt fail;
            # only the start of a very long transcript is then summarized.
            summary_result = summarizer(
                updated_transcript,
                max_length=150,
                min_length=30,
                do_sample=False,
                truncation=True,
            )
            raw_summary = summary_result[0]['summary_text']
            new_summary = format_summary_as_bullets(raw_summary)
            last_summary_time_state = current_time
        except Exception:
            # Show the error above the previous summary but keep the stored
            # summary and timestamp untouched so the next chunk retries.
            return updated_transcript, f"[Summarization Error]\n{current_summary_state}", updated_transcript, last_summary_time_state, current_summary_state
    return updated_transcript, new_summary, updated_transcript, last_summary_time_state, new_summary
# --- Gradio UI ---
print("Creating Gradio interface...")
with gr.Blocks() as demo:
    gr.Markdown("# 🎤 Real-Time Meeting Notes with Google Meet Integration")
    # Each state is created once and bound to a name so the same object is
    # used as both event input and output; fresh gr.State(...) instances in
    # the event wiring would never feed a value back into the next call.
    transcript_state = gr.State("")
    last_summary_time_state = gr.State(0.0)
    summary_state = gr.State("")
    with gr.Row():
        with gr.Column(scale=1):
            audio_stream = gr.Audio(sources=["microphone"], streaming=True, label="🎙 Live Audio", type="numpy")
            gr.Image(sources=["webcam"], label="📷 Webcam", streaming=True)
        with gr.Column(scale=2):
            transcription_output = gr.Textbox(label="📝 Full Transcription", lines=10, interactive=False)
            summary_output = gr.Textbox(label=f"🔹 Bullet Point Summary (Updates ~{SUMMARY_INTERVAL}s)", lines=6, interactive=False)
    # Google Meet Button (opens in new tab)
    google_meet_button = gr.Button("Start Google Meet")
    # NOTE(review): `sources=` above is the Gradio 4 spelling, and Gradio 4
    # names the client-side hook `js` (formerly `_js`) - confirm the pinned
    # Gradio version.
    google_meet_button.click(fn=None, inputs=[], outputs=[], js="() => window.open('https://meet.google.com/new', '_blank')")
    # Streaming Audio Processing: the visible microphone widget is both the
    # event source and the first input.
    audio_stream.stream(
        fn=process_audio_stream,
        inputs=[audio_stream, transcript_state, last_summary_time_state, summary_state],
        outputs=[transcription_output, summary_output, transcript_state, last_summary_time_state, summary_state],
    )

    # Clear button
    def clear_state():
        """Return reset values for both textboxes and the three states, in output order."""
        return "", "", "", 0.0, ""

    clear_button = gr.Button("Clear Transcript & Summary")
    clear_button.click(
        fn=clear_state,
        inputs=[],
        outputs=[transcription_output, summary_output, transcript_state, last_summary_time_state, summary_state],
    )
print("Launching Gradio app...")
demo.queue()
demo.launch(debug=True, share=True)