import gradio as gr
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import time
import numpy as np
# --- Configuration ---
device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(f"Device set to use: {device}")
stt_model_id = "openai/whisper-tiny"
summarizer_model_id = "sshleifer/distilbart-cnn-6-6"
SUMMARY_INTERVAL = 30.0  # Seconds between summarization updates
# --- Load Models ---
print("Loading Speech-to-Text (STT) model...")
stt_model = AutoModelForSpeechSeq2Seq.from_pretrained(stt_model_id).to(device)
processor = AutoProcessor.from_pretrained(stt_model_id)
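# Optional tweak (a sketch, not used here): on a CUDA GPU the Whisper weights
# could be loaded in half precision to cut memory use and latency, e.g.
#   stt_model = AutoModelForSpeechSeq2Seq.from_pretrained(
#       stt_model_id, torch_dtype=torch.float16
#   ).to(device)
# Default precision is kept above so the app also runs on CPU.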
stt_pipeline = pipeline(
    "automatic-speech-recognition",
    model=stt_model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    generate_kwargs={"max_new_tokens": 128},  # FIXED: max_new_tokens belongs in generate_kwargs
    chunk_length_s=30,
    batch_size=16,
    device=device,
)
print("Loading Summarization model...")
summarizer = pipeline("summarization", model=summarizer_model_id, device=device)
def format_summary_as_bullets(summary_text):
    """Format summary text into bullet points, one per sentence."""
    if not summary_text:
        return ""
    # Insert a bullet after each sentence boundary and prefix the first line.
    return "- " + summary_text.replace(". ", ".\n- ").strip()
def process_audio_stream(new_chunk_tuple, accumulated_transcript_state, last_summary_time_state, current_summary_state):
    """Transcribe a streamed audio chunk and periodically refresh the summary.

    Returns a 5-tuple matching the stream outputs:
    (transcript textbox, summary textbox, transcript state, last summary time state, summary state).
    """
    if new_chunk_tuple is None:
        return accumulated_transcript_state, current_summary_state, accumulated_transcript_state, last_summary_time_state, current_summary_state
    sample_rate, audio_chunk = new_chunk_tuple
    if audio_chunk is None or sample_rate is None or audio_chunk.size == 0:
        return accumulated_transcript_state, current_summary_state, accumulated_transcript_state, last_summary_time_state, current_summary_state
    # Downmix stereo to mono; Whisper expects a 1-D waveform.
    if audio_chunk.ndim > 1:
        audio_chunk = audio_chunk.mean(axis=1)
    # Gradio delivers numpy audio as int16 by default; normalize to float32 in [-1, 1].
    if audio_chunk.dtype == np.int16:
        audio_chunk = audio_chunk.astype(np.float32) / 32768.0
    elif audio_chunk.dtype != np.float32:
        audio_chunk = audio_chunk.astype(np.float32)
    # Speech-to-text on the new chunk.
    try:
        result = stt_pipeline({"sampling_rate": sample_rate, "raw": audio_chunk.copy()})
        new_text = result["text"].strip() if result["text"] else ""
    except Exception as e:
        new_text = f"[Transcription Error: {e}]"
    updated_transcript = accumulated_transcript_state + " " + new_text if accumulated_transcript_state else new_text
    # Re-summarize at most once per SUMMARY_INTERVAL, and only once there is enough text.
    current_time = time.time()
    new_summary = current_summary_state
    if updated_transcript and len(updated_transcript) > 50 and (current_time - last_summary_time_state > SUMMARY_INTERVAL):
        try:
            summary_result = summarizer(updated_transcript, max_length=150, min_length=30, do_sample=False)
            raw_summary = summary_result[0]["summary_text"]
            new_summary = format_summary_as_bullets(raw_summary)
            last_summary_time_state = current_time
        except Exception as e:
            return updated_transcript, f"[Summarization Error: {e}]\n{current_summary_state}", updated_transcript, last_summary_time_state, current_summary_state
    return updated_transcript, new_summary, updated_transcript, last_summary_time_state, new_summary
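# Minimal offline sanity check (a sketch; 16 kHz is an assumed sample rate, and
# one second of silence stands in for a real microphone chunk):
#   silence = np.zeros(16000, dtype=np.float32)
#   out = process_audio_stream((16000, silence), "", 0.0, "")
#   # out == (transcript, summary, transcript_state, last_time, summary_state)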
# --- Gradio UI ---
print("Creating Gradio interface...")
with gr.Blocks() as demo:
    gr.Markdown("# 🎤 AI-Powered Meeting Notes with Google Meet Integration")

    # Session state carried between streaming callbacks.
    transcript_state = gr.State("")
    last_summary_time_state = gr.State(0.0)
    summary_state = gr.State("")

    with gr.Row():
        with gr.Column(scale=1):
            audio_input = gr.Audio(sources=["microphone"], streaming=True, label="🎙 Live Audio", type="numpy")
            gr.Image(sources=["webcam"], label="📷 Webcam", streaming=True)
        with gr.Column(scale=2):
            transcription_output = gr.Textbox(label="📝 Full Transcription", lines=10, interactive=False)
            summary_output = gr.Textbox(label=f"🔹 Summary (Updates ~{SUMMARY_INTERVAL:.0f}s)", lines=6, interactive=False)

    # Google Meet button: client-side JS opens a new meeting in another tab
    # (Gradio 4.x spells this argument `js=`; 3.x releases used `_js=`).
    google_meet_button = gr.Button("Start Google Meet")
    google_meet_button.click(
        fn=lambda: None,
        inputs=[],
        outputs=[],
        js="() => window.open('https://meet.google.com/new', '_blank')",
    )

    # Stream microphone chunks through the transcription/summarization callback.
    audio_input.stream(
        fn=process_audio_stream,
        inputs=[audio_input, transcript_state, last_summary_time_state, summary_state],
        outputs=[transcription_output, summary_output, transcript_state, last_summary_time_state, summary_state],
    )

    # Clear button resets both textboxes and all session state.
    def clear_state():
        return "", "", "", 0.0, ""

    clear_button = gr.Button("Clear Transcript & Summary")
    clear_button.click(
        fn=clear_state,
        inputs=[],
        outputs=[transcription_output, summary_output, transcript_state, last_summary_time_state, summary_state],
    )
print("Launching Gradio app...")
demo.queue()
demo.launch(debug=True, share=True)