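"""Live meeting-notes demo: streaming Whisper transcription with a rolling
DistilBART summary, served through a Gradio Blocks UI."""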
import gradio as gr
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import time
import numpy as np
# --- Configuration ---
device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(f"Device set to use: {device}")
stt_model_id = "openai/whisper-tiny"
summarizer_model_id = "sshleifer/distilbart-cnn-6-6"
SUMMARY_INTERVAL = 30.0  # seconds between summary refreshes
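# Model-choice note (assumption): whisper-tiny and distilbart-cnn-6-6 are the
# smallest checkpoints in their families, picked for low streaming latency;
# larger ones (e.g. whisper-small) transcribe more accurately but run slower.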
# --- Load Models ---
print("Loading Speech-to-Text (STT) model...")
stt_model = AutoModelForSpeechSeq2Seq.from_pretrained(stt_model_id).to(device)
processor = AutoProcessor.from_pretrained(stt_model_id)
stt_pipeline = pipeline(
    "automatic-speech-recognition",
    model=stt_model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    generate_kwargs={"max_new_tokens": 128},  # cap tokens generated per chunk
    chunk_length_s=30,
    batch_size=16,
    device=device,
)
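# The ASR pipeline accepts raw numpy audio as {"sampling_rate": int, "raw": array};
# chunk_length_s=30 makes it split audio longer than Whisper's 30-second window
# into chunks and stitch the transcribed text back together.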
print("Loading Summarization model...")
summarizer = pipeline("summarization", model=summarizer_model_id, device=device)
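# Note: this BART-family model accepts at most 1024 input tokens, which is why
# the summarization call below passes truncation=True.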
def format_summary_as_bullets(summary_text):
    """Format summary into bullet points, one per sentence."""
    if not summary_text:
        return ""
    # Start a new bullet after each sentence-ending period.
    return "- " + summary_text.replace(". ", ".\n- ").strip()
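# Example: format_summary_as_bullets("Alice spoke. Bob agreed.")
#   -> "- Alice spoke.\n- Bob agreed."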
def process_audio_stream(new_chunk_tuple, accumulated_transcript_state, last_summary_time_state, current_summary_state):
    """Process one streamed audio chunk into an updated transcript and summary."""
    # No new audio: echo the current state back unchanged.
    if new_chunk_tuple is None:
        return accumulated_transcript_state, current_summary_state, accumulated_transcript_state, last_summary_time_state, current_summary_state

    sample_rate, audio_chunk = new_chunk_tuple
    if audio_chunk is None or sample_rate is None or audio_chunk.size == 0:
        return accumulated_transcript_state, current_summary_state, accumulated_transcript_state, last_summary_time_state, current_summary_state

    # Gradio's microphone stream delivers int16 PCM; normalize to float32 in [-1, 1].
    if audio_chunk.dtype != np.float32:
        audio_chunk = audio_chunk.astype(np.float32) / 32768.0

    # Speech-to-text on the new chunk.
    try:
        result = stt_pipeline({"sampling_rate": sample_rate, "raw": audio_chunk.copy()})
        new_text = result["text"].strip() if result["text"] else ""
    except Exception as e:
        new_text = f"[Transcription Error: {e}]"

    updated_transcript = accumulated_transcript_state + " " + new_text if accumulated_transcript_state else new_text

    # Refresh the summary at most once every SUMMARY_INTERVAL seconds,
    # and only once there is enough transcript to be worth summarizing.
    current_time = time.time()
    new_summary = current_summary_state
    if updated_transcript and len(updated_transcript) > 50 and (current_time - last_summary_time_state > SUMMARY_INTERVAL):
        try:
            # truncation=True clips transcripts that exceed the model's
            # 1024-token input window instead of raising an error.
            summary_result = summarizer(updated_transcript, max_length=150, min_length=30, do_sample=False, truncation=True)
            raw_summary = summary_result[0]["summary_text"]
            new_summary = format_summary_as_bullets(raw_summary)
            last_summary_time_state = current_time
        except Exception as e:
            return updated_transcript, f"[Summarization Error: {e}]\n{current_summary_state}", updated_transcript, last_summary_time_state, current_summary_state

    return updated_transcript, new_summary, updated_transcript, last_summary_time_state, new_summary
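# Return order is (transcript box, summary box, transcript state,
# last-summary-time state, summary state); it must match the outputs
# list wired to .stream() below.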
# --- Gradio UI ---
print("Creating Gradio interface...")
with gr.Blocks() as demo:
    gr.Markdown("# 🎤 AI-Powered Meeting Notes with Google Meet Integration")
    with gr.Row():
        with gr.Column(scale=1):
            mic_input = gr.Audio(sources=["microphone"], streaming=True, label="🎙 Live Audio", type="numpy")
            gr.Image(sources=["webcam"], label="📷 Webcam", streaming=True)
        with gr.Column(scale=2):
            transcription_output = gr.Textbox(label="📝 Full Transcription", lines=10, interactive=False)
            summary_output = gr.Textbox(label=f"🔹 Summary (Updates ~{SUMMARY_INTERVAL:.0f}s)", lines=6, interactive=False)

    # Per-session state carried between stream callbacks.
    transcript_state = gr.State("")
    last_summary_time_state = gr.State(0.0)
    summary_state = gr.State("")

    # Google Meet button (opens in a new tab). `js=` is the Gradio 4+ spelling;
    # older releases used `_js=`.
    google_meet_button = gr.Button("Start Google Meet")
    google_meet_button.click(
        fn=None,
        inputs=[],
        outputs=[],
        js="() => window.open('https://meet.google.com/new', '_blank')",
    )

    # Stream microphone chunks through the STT/summarization callback, reusing
    # the single mic component and the State objects declared above.
    mic_input.stream(
        fn=process_audio_stream,
        inputs=[mic_input, transcript_state, last_summary_time_state, summary_state],
        outputs=[transcription_output, summary_output, transcript_state, last_summary_time_state, summary_state],
    )

    # Clear button resets both textboxes and all three pieces of state.
    clear_button = gr.Button("Clear Transcript & Summary")
    clear_button.click(
        fn=lambda: ("", "", "", 0.0, ""),
        inputs=[],
        outputs=[transcription_output, summary_output, transcript_state, last_summary_time_state, summary_state],
    )
print("Launching Gradio app...")
demo.queue()
demo.launch(debug=True, share=True)