Anupam007 committed on
Commit
1c244b4
·
verified ·
1 Parent(s): 07131a9

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +110 -0
app.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
4
+ import time
5
+ import numpy as np
6
+ import soundfile as sf
7
+
8
# --- Configuration ---
# Use the first CUDA device with half precision when one is available;
# otherwise run on the CPU in full precision.
if torch.cuda.is_available():
    device = "cuda:0"
    torch_dtype = torch.float16
else:
    device = "cpu"
    torch_dtype = torch.float32
print(f"Using device: {device}")

# STT Model
# The model is loaded and moved to the target device before the pipeline is
# built around it, together with the tokenizer and feature extractor taken
# from the matching processor.
stt_model_id = "openai/whisper-tiny"
stt_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    stt_model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
stt_model.to(device)
processor = AutoProcessor.from_pretrained(stt_model_id)
stt_pipeline = pipeline(
    "automatic-speech-recognition",
    model=stt_model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=16,
    torch_dtype=torch_dtype,
    device=device,
)

# Summarization Model
summarizer_model_id = "sshleifer/distilbart-cnn-6-6"
summarizer = pipeline("summarization", model=summarizer_model_id, device=device)

# Minimum number of seconds between two summary refreshes.
SUMMARY_INTERVAL = 30.0
37
+
38
def format_summary_as_bullets(summary_text):
    """Format a summary as one ``- `` bullet per sentence.

    Sentences are split after ``.``, ``!`` or ``?`` followed by a space, and
    at existing line breaks. Each piece is stripped of surrounding
    whitespace and empty pieces are dropped, so a trailing ". " or a
    whitespace-only summary never yields a dangling "-" bullet.

    Args:
        summary_text: The raw summary text; may be ``None`` or empty.

    Returns:
        The bullet list as a single newline-joined string, or ``""`` when
        there is nothing to show.
    """
    if not summary_text:
        return ""

    # Turn every sentence boundary into a line break, then treat each
    # non-empty line as one bullet.
    text = summary_text
    for mark in ".!?":
        text = text.replace(mark + " ", mark + "\n")

    sentences = (line.strip() for line in text.splitlines())
    return "\n".join(f"- {sentence}" for sentence in sentences if sentence)
44
+
45
def process_audio_stream(new_chunk_tuple, transcript_state, last_summary_time, summary_state):
    """Transcribe one streamed audio chunk and refresh the summary when due.

    Args:
        new_chunk_tuple: ``(sample_rate, samples)`` pair from the streaming
            audio input, or ``None`` when no audio was delivered.
        transcript_state: Transcript text accumulated so far.
        last_summary_time: ``time.time()`` value recorded at the last
            successful summary.
        summary_state: The last successfully generated bullet summary.

    Returns:
        A 5-tuple ``(transcript, summary_display, transcript_state,
        last_summary_time, summary_state)`` in the order of the outputs
        wired to the stream event. On a summarization failure only
        ``summary_display`` carries the error marker; the stored summary and
        timestamp are left untouched so the next chunk retries.
    """
    unchanged = (transcript_state, summary_state, transcript_state, last_summary_time, summary_state)
    if new_chunk_tuple is None:
        return unchanged

    sample_rate, audio_chunk = new_chunk_tuple
    if audio_chunk is None or audio_chunk.size == 0:
        return unchanged

    # Integer PCM is scaled by its own full-scale value (32768 for int16);
    # other non-float32 dtypes are only cast, never rescaled.
    if np.issubdtype(audio_chunk.dtype, np.integer):
        full_scale = float(np.iinfo(audio_chunk.dtype).max) + 1.0
        audio_chunk = audio_chunk.astype(np.float32) / full_scale
    elif audio_chunk.dtype != np.float32:
        audio_chunk = audio_chunk.astype(np.float32)

    # The speech-recognition pipeline expects single-channel audio, so a
    # (samples, channels) chunk is down-mixed to mono.
    if audio_chunk.ndim > 1:
        audio_chunk = audio_chunk.mean(axis=1)

    try:
        result = stt_pipeline({"sampling_rate": sample_rate, "raw": audio_chunk.copy()})
        new_text = result["text"].strip() if result["text"] else ""
    except Exception as e:
        # Best effort: surface the failure in the transcript instead of
        # aborting the stream.
        new_text = f"[Transcription Error: {e}]"

    # Append only real text so silent chunks do not pile up trailing spaces.
    if not new_text:
        updated_transcript = transcript_state
    elif transcript_state:
        updated_transcript = f"{transcript_state} {new_text}"
    else:
        updated_transcript = new_text

    current_time = time.time()
    new_summary = summary_state
    updated_last_summary_time = last_summary_time

    summary_due = current_time - last_summary_time > SUMMARY_INTERVAL
    if len(updated_transcript) > 50 and summary_due:
        try:
            # truncation=True keeps a long transcript within the model's
            # input limit instead of failing on every later chunk.
            summary_result = summarizer(
                updated_transcript,
                max_length=150,
                min_length=30,
                do_sample=False,
                truncation=True,
            )
            if summary_result and isinstance(summary_result, list):
                raw_summary = summary_result[0]['summary_text']
                new_summary = format_summary_as_bullets(raw_summary)
                updated_last_summary_time = current_time
        except Exception as e:
            print(f"Summarization failed: {e}")
            return (
                updated_transcript,
                f"[Summarization Error]\n\n{summary_state}",
                updated_transcript,
                last_summary_time,
                summary_state,
            )

    return updated_transcript, new_summary, updated_transcript, updated_last_summary_time, new_summary
80
+
81
# --- Gradio Interface ---
# Layout: a header with a Google Meet link, a streaming microphone input in
# the left column, and the running transcript plus bullet summary on the right.
with gr.Blocks() as demo:
    gr.Markdown("# Real-Time Meeting Notes with Google Meet")
    gr.Markdown("Click the button below to start a Google Meet session.")

    # NOTE(review): `{target=_blank}` is attribute-list syntax; confirm that
    # Gradio's Markdown renderer supports it, otherwise it is shown literally.
    google_meet_button = gr.Markdown("### [Start Google Meet](https://meet.google.com/new){target=_blank}")

    # State values passed into and returned from every stream callback.
    transcript_state = gr.State("")
    last_summary_time = gr.State(0.0)
    summary_state = gr.State("")

    with gr.Row():
        with gr.Column(scale=1):
            audio_stream = gr.Audio(sources=["microphone"], streaming=True, label="Live Microphone Input", type="numpy")

        with gr.Column(scale=2):
            transcription_output = gr.Textbox(label="Full Transcription", lines=15, interactive=False)
            summary_output = gr.Textbox(label=f"Bullet Point Summary (Updates ~every {SUMMARY_INTERVAL}s)", lines=10, interactive=False)

    audio_stream.stream(
        fn=process_audio_stream,
        inputs=[audio_stream, transcript_state, last_summary_time, summary_state],
        outputs=[transcription_output, summary_output, transcript_state, last_summary_time, summary_state],
    )

    clear_button = gr.Button("Clear Transcript & Summary")
    # One return value per output, in output order: both textboxes, then the
    # transcript state, the last-summary timestamp and the summary state.
    clear_button.click(
        fn=lambda: ("", "", "", 0.0, ""),
        inputs=[],
        outputs=[transcription_output, summary_output, transcript_state, last_summary_time, summary_state],
    )

demo.queue()
demo.launch(debug=True, share=True)