import gradio as gr
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import time
import numpy as np

# --- Configuration ---
device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(f"Device set to use: {device}")

stt_model_id = "openai/whisper-tiny"
summarizer_model_id = "sshleifer/distilbart-cnn-6-6"
SUMMARY_INTERVAL = 30.0  # Seconds between summarization updates

# --- Load Models ---
print("Loading Speech-to-Text (STT) model...")
stt_model = AutoModelForSpeechSeq2Seq.from_pretrained(stt_model_id).to(device)
processor = AutoProcessor.from_pretrained(stt_model_id)
stt_pipeline = pipeline(
    "automatic-speech-recognition",
    model=stt_model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    generate_kwargs={"max_new_tokens": 128},  # Cap on generated tokens per chunk
    chunk_length_s=30,
    batch_size=16,
    device=device,
)

print("Loading Summarization model...")
summarizer = pipeline("summarization", model=summarizer_model_id, device=device)


def format_summary_as_bullets(summary_text):
    """Format a summary string as one bullet point per sentence."""
    if not summary_text:
        return ""
    sentences = [s.strip() for s in summary_text.replace(". ", ".\n").splitlines() if s.strip()]
    return "\n".join(f"- {s}" for s in sentences)


def process_audio_stream(new_chunk_tuple, accumulated_transcript_state, last_summary_time_state, current_summary_state):
    """Transcribe each incoming audio chunk and periodically refresh the summary."""
    if new_chunk_tuple is None:
        return (accumulated_transcript_state, current_summary_state,
                accumulated_transcript_state, last_summary_time_state, current_summary_state)

    sample_rate, audio_chunk = new_chunk_tuple
    if audio_chunk is None or sample_rate is None or audio_chunk.size == 0:
        return (accumulated_transcript_state, current_summary_state,
                accumulated_transcript_state, last_summary_time_state, current_summary_state)

    # Gradio delivers int16 PCM; convert to float32 in [-1, 1] for Whisper.
    if audio_chunk.dtype != np.float32:
        audio_chunk = audio_chunk.astype(np.float32) / 32768.0

    # Speech-to-text on the new chunk.
    try:
        result = stt_pipeline({"sampling_rate": sample_rate, "raw": audio_chunk.copy()})
        new_text = result["text"].strip() if result["text"] else ""
    except Exception as e:
        new_text = f"[Transcription Error: {e}]"

    updated_transcript = (accumulated_transcript_state + " " + new_text) if accumulated_transcript_state else new_text

    # Refresh the summary at most once every SUMMARY_INTERVAL seconds.
    current_time = time.time()
    new_summary = current_summary_state
    if (updated_transcript and len(updated_transcript) > 50
            and current_time - last_summary_time_state > SUMMARY_INTERVAL):
        try:
            summary_result = summarizer(updated_transcript, max_length=150, min_length=30, do_sample=False)
            raw_summary = summary_result[0]["summary_text"]
            new_summary = format_summary_as_bullets(raw_summary)
            last_summary_time_state = current_time
        except Exception:
            return (updated_transcript, f"[Summarization Error]\n{current_summary_state}",
                    updated_transcript, last_summary_time_state, current_summary_state)

    return updated_transcript, new_summary, updated_transcript, last_summary_time_state, new_summary


# --- Gradio UI ---
print("Creating Gradio interface...")
with gr.Blocks() as demo:
    gr.Markdown("# 🎤 AI-Powered Meeting Notes with Google Meet Integration")

    # Per-session state shared between streaming callbacks.
    transcript_state = gr.State("")
    summary_time_state = gr.State(0.0)
    summary_state = gr.State("")

    with gr.Row():
        with gr.Column(scale=1):
            audio_input = gr.Audio(sources=["microphone"], streaming=True, label="🎙 Live Audio", type="numpy")
            gr.Image(sources=["webcam"], label="📷 Webcam", streaming=True)
        with gr.Column(scale=2):
            transcription_output = gr.Textbox(label="📝 Full Transcription", lines=10, interactive=False)
            summary_output = gr.Textbox(label=f"🔹 Summary (Updates ~{SUMMARY_INTERVAL}s)", lines=6, interactive=False)

    # Google Meet button (opens a new tab via client-side JS).
    google_meet_button = gr.Button("Start Google Meet")
    google_meet_button.click(
        fn=None,
        inputs=[],
        outputs=[],
        js="() => window.open('https://meet.google.com/new', '_blank')",
    )

    # Stream microphone audio through the transcription/summarization callback.
    audio_input.stream(
        fn=process_audio_stream,
        inputs=[audio_input, transcript_state, summary_time_state, summary_state],
        outputs=[transcription_output, summary_output, transcript_state, summary_time_state, summary_state],
    )

    # Clear button resets both displays and all session state.
    clear_button = gr.Button("Clear Transcript & Summary")
    clear_button.click(
        fn=lambda: ("", "", "", 0.0, ""),
        inputs=[],
        outputs=[transcription_output, summary_output, transcript_state, summary_time_state, summary_state],
    )

print("Launching Gradio app...")
demo.queue()
demo.launch(debug=True, share=True)