import gradio as gr
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import time
import numpy as np

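# Live meeting-notes demo: microphone audio is streamed through a Whisper
# speech-to-text pipeline, and the running transcript is periodically
# summarized with DistilBART inside a Gradio Blocks UI.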
# --- Configuration ---
device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(f"Device set to use: {device}")

stt_model_id = "openai/whisper-tiny"
summarizer_model_id = "sshleifer/distilbart-cnn-6-6"
SUMMARY_INTERVAL = 30.0  # Seconds between summarization updates

# --- Load Models ---
print("Loading Speech-to-Text (STT) model...")
stt_model = AutoModelForSpeechSeq2Seq.from_pretrained(stt_model_id).to(device)
processor = AutoProcessor.from_pretrained(stt_model_id)

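# Chunked long-form transcription: audio is split into 30-second windows and
# decoded in batches of 16 on the selected device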
stt_pipeline = pipeline(
    "automatic-speech-recognition",
    model=stt_model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    generate_kwargs={"max_new_tokens": 128},  # generation options must be passed via generate_kwargs
    chunk_length_s=30,
    batch_size=16,
    device=device,
)

print("Loading Summarization model...")
summarizer = pipeline("summarization", model=summarizer_model_id, device=device)
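# Note: DistilBART has a limited input length (roughly 1024 tokens), so very
# long transcripts may not be fully reflected in the summary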

def format_summary_as_bullets(summary_text):
    """Format a summary into bullet points, one sentence per line."""
    if not summary_text:
        return ""
    return "- " + summary_text.replace(". ", ".\n- ").strip()

def process_audio_stream(new_chunk_tuple, accumulated_transcript_state, last_summary_time_state, current_summary_state):
    """Transcribe one streaming audio chunk and periodically refresh the summary.

    Returns a 5-tuple: (transcript textbox value, summary textbox value,
    transcript state, last-summary-time state, summary state).
    """
    if new_chunk_tuple is None:
        return accumulated_transcript_state, current_summary_state, accumulated_transcript_state, last_summary_time_state, current_summary_state

    sample_rate, audio_chunk = new_chunk_tuple
    if audio_chunk is None or sample_rate is None or audio_chunk.size == 0:
        return accumulated_transcript_state, current_summary_state, accumulated_transcript_state, last_summary_time_state, current_summary_state

    # Normalize integer PCM (Gradio microphone chunks are typically int16) to float32 in [-1, 1]
    if audio_chunk.dtype != np.float32:
        audio_chunk = audio_chunk.astype(np.float32) / 32768.0

    # Speech-to-text processing
    try:
        result = stt_pipeline({"sampling_rate": sample_rate, "raw": audio_chunk.copy()})
        new_text = result["text"].strip() if result["text"] else ""
    except Exception as e:
        new_text = f"[Transcription Error: {e}]"

    updated_transcript = accumulated_transcript_state + " " + new_text if accumulated_transcript_state else new_text

    # Summarization every SUMMARY_INTERVAL
    current_time = time.time()
    new_summary = current_summary_state
    if updated_transcript and len(updated_transcript) > 50 and (current_time - last_summary_time_state > SUMMARY_INTERVAL):
        try:
            summary_result = summarizer(updated_transcript, max_length=150, min_length=30, do_sample=False)
            raw_summary = summary_result[0]['summary_text']
            new_summary = format_summary_as_bullets(raw_summary)
            last_summary_time_state = current_time  
        except Exception as e:
            return updated_transcript, f"[Summarization Error]\n{current_summary_state}", updated_transcript, last_summary_time_state, current_summary_state

    return updated_transcript, new_summary, updated_transcript, last_summary_time_state, new_summary

# --- Gradio UI ---
print("Creating Gradio interface...")
with gr.Blocks() as demo:
    gr.Markdown("# 🎀 AI-Powered Meeting Notes with Google Meet Integration")
    
    # Session state shared between the streaming handler and the clear button
    transcript_state = gr.State("")
    last_summary_time_state = gr.State(0.0)
    summary_state = gr.State("")

    with gr.Row():
        with gr.Column(scale=1):
            audio_input = gr.Audio(sources=["microphone"], streaming=True, label="🎙 Live Audio", type="numpy")
            webcam_input = gr.Image(sources=["webcam"], label="📷 Webcam", streaming=True)

        with gr.Column(scale=2):
            transcription_output = gr.Textbox(label="📝 Full Transcription", lines=10, interactive=False)
            summary_output = gr.Textbox(label=f"🔹 Summary (Updates ~{SUMMARY_INTERVAL}s)", lines=6, interactive=False)

    # Google Meet Button (opens in new tab)
    google_meet_button = gr.Button("Start Google Meet")
    google_meet_button.click(fn=lambda: None, inputs=[], outputs=[], js="() => window.open('https://meet.google.com/new', '_blank')")

    # Streaming audio processing: each chunk goes through transcription and,
    # at most every SUMMARY_INTERVAL seconds, through summarization
    audio_input.stream(
        fn=process_audio_stream,
        inputs=[audio_input, transcript_state, last_summary_time_state, summary_state],
        outputs=[transcription_output, summary_output, transcript_state, last_summary_time_state, summary_state],
    )

    # Clear button resets both textboxes and all session state
    def clear_state():
        return "", "", "", 0.0, ""
    clear_button = gr.Button("Clear Transcript & Summary")
    clear_button.click(
        fn=clear_state,
        inputs=[],
        outputs=[transcription_output, summary_output, transcript_state, last_summary_time_state, summary_state],
    )

print("Launching Gradio app...")
demo.queue()
demo.launch(debug=True, share=True)