Anupam007 committed
Commit b044e07 · verified · 1 Parent(s): 5500968

Update app.py

Files changed (1)
  1. app.py +57 -64
app.py CHANGED
@@ -4,107 +4,100 @@ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
 import time
 import numpy as np
 import soundfile as sf
+import librosa
 
 # --- Configuration ---
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
 print(f"Using device: {device}")
 
-# STT Model
 stt_model_id = "openai/whisper-tiny"
-stt_model = AutoModelForSpeechSeq2Seq.from_pretrained(
-    stt_model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
-)
-stt_model.to(device)
-processor = AutoProcessor.from_pretrained(stt_model_id)
-stt_pipeline = pipeline(
-    "automatic-speech-recognition",
-    model=stt_model,
-    tokenizer=processor.tokenizer,
-    feature_extractor=processor.feature_extractor,
-    max_new_tokens=128,
-    chunk_length_s=30,
-    batch_size=16,
-    torch_dtype=torch_dtype,
-    device=device,
-)
-
-# Summarization Model
 summarizer_model_id = "sshleifer/distilbart-cnn-6-6"
-summarizer = pipeline("summarization", model=summarizer_model_id, device=device)
-
-SUMMARY_INTERVAL = 30.0
+SUMMARY_INTERVAL = 30.0
 
+# --- Load Models ---
+print("Loading STT model...")
+stt_model = AutoModelForSpeechSeq2Seq.from_pretrained(stt_model_id).to(device)
+processor = AutoProcessor.from_pretrained(stt_model_id)
+stt_pipeline = pipeline("automatic-speech-recognition", model=stt_model, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor, max_new_tokens=128, chunk_length_s=30, batch_size=16, torch_dtype=torch_dtype, device=device)
+
+print("Loading Summarization model...")
+summarizer = pipeline("summarization", model=summarizer_model_id, device=device)
 
 def format_summary_as_bullets(summary_text):
-    """Formats a summary into bullet points."""
+    """Format summary into bullet points."""
     if not summary_text:
         return ""
     sentences = summary_text.replace(". ", ".\n- ").split('\n')
     return "- " + "\n".join(sentences).strip()
 
-def process_audio_stream(new_chunk_tuple, transcript_state, last_summary_time, summary_state):
+def process_audio_stream(new_chunk_tuple, accumulated_transcript_state, last_summary_time_state, current_summary_state):
+    """Process streaming audio into transcript and summary."""
     if new_chunk_tuple is None:
-        return transcript_state, summary_state, transcript_state, last_summary_time, summary_state
+        return accumulated_transcript_state, current_summary_state, accumulated_transcript_state, last_summary_time_state, current_summary_state
 
     sample_rate, audio_chunk = new_chunk_tuple
-    if audio_chunk is None or audio_chunk.size == 0:
-        return transcript_state, summary_state, transcript_state, last_summary_time, summary_state
+    if audio_chunk is None or sample_rate is None or audio_chunk.size == 0:
+        return accumulated_transcript_state, current_summary_state, accumulated_transcript_state, last_summary_time_state, current_summary_state
 
+    # Convert to float32 if needed
     if audio_chunk.dtype != np.float32:
-        audio_chunk = audio_chunk.astype(np.float32) / 32768.0
-
-    new_text = ""
+        audio_chunk = audio_chunk.astype(np.float32) / 32768.0
+
+    # Speech-to-text processing
     try:
         result = stt_pipeline({"sampling_rate": sample_rate, "raw": audio_chunk.copy()})
         new_text = result["text"].strip() if result["text"] else ""
     except Exception as e:
         new_text = f"[Transcription Error: {e}]"
-
-    updated_transcript = transcript_state + (" " + new_text if transcript_state else new_text)
-
+
+    updated_transcript = accumulated_transcript_state + " " + new_text if accumulated_transcript_state else new_text
+
+    # Summarization every SUMMARY_INTERVAL
     current_time = time.time()
-    new_summary = summary_state
-    updated_last_summary_time = last_summary_time
-
-    if updated_transcript and len(updated_transcript) > 50 and (current_time - last_summary_time > SUMMARY_INTERVAL):
+    new_summary = current_summary_state
+    if updated_transcript and len(updated_transcript) > 50 and (current_time - last_summary_time_state > SUMMARY_INTERVAL):
         try:
             summary_result = summarizer(updated_transcript, max_length=150, min_length=30, do_sample=False)
-            if summary_result and isinstance(summary_result, list):
-                raw_summary = summary_result[0]['summary_text']
-                new_summary = format_summary_as_bullets(raw_summary)
-                updated_last_summary_time = current_time
+            raw_summary = summary_result[0]['summary_text']
+            new_summary = format_summary_as_bullets(raw_summary)
+            last_summary_time_state = current_time
         except Exception as e:
-            return updated_transcript, f"[Summarization Error]\n\n{summary_state}", updated_transcript, last_summary_time, summary_state
-
-    return updated_transcript, new_summary, updated_transcript, updated_last_summary_time, new_summary
+            return updated_transcript, f"[Summarization Error]\n{current_summary_state}", updated_transcript, last_summary_time_state, current_summary_state
 
-# --- Gradio Interface ---
+    return updated_transcript, new_summary, updated_transcript, last_summary_time_state, new_summary
+
+# --- Gradio UI ---
+print("Creating Gradio interface...")
 with gr.Blocks() as demo:
-    gr.Markdown("# Real-Time Meeting Notes with Google Meet")
-    gr.Markdown("Click the button below to start a Google Meet session.")
-
-    google_meet_button = gr.Markdown("### [Start Google Meet](https://meet.google.com/new){target=_blank}")
-
-    transcript_state = gr.State("")
-    last_summary_time = gr.State(0.0)
-    summary_state = gr.State("")
+    gr.Markdown("# 🎤 Real-Time Meeting Notes with Google Meet Integration")
 
     with gr.Row():
         with gr.Column(scale=1):
-            audio_stream = gr.Audio(sources=["microphone"], streaming=True, label="Live Microphone Input", type="numpy")
-
+            gr.Audio(sources=["microphone"], streaming=True, label="🎙 Live Audio", type="numpy")
+            gr.Image(sources=["webcam"], label="📷 Webcam", streaming=True)
+
         with gr.Column(scale=2):
-            transcription_output = gr.Textbox(label="Full Transcription", lines=15, interactive=False)
-            summary_output = gr.Textbox(label=f"Bullet Point Summary (Updates ~every {SUMMARY_INTERVAL}s)", lines=10, interactive=False)
-
-    audio_stream.stream(
+            transcription_output = gr.Textbox(label="📝 Full Transcription", lines=10, interactive=False)
+            summary_output = gr.Textbox(label=f"🔹 Bullet Point Summary (Updates ~{SUMMARY_INTERVAL}s)", lines=6, interactive=False)
+
+    # Google Meet Button (opens in new tab)
+    google_meet_button = gr.Button("Start Google Meet")
+    google_meet_button.click(fn=lambda: None, inputs=[], outputs=[], _js="() => window.open('https://meet.google.com/new', '_blank')")
+
+    # Streaming Audio Processing
+    gr.Audio(sources=["microphone"], streaming=True).stream(
         fn=process_audio_stream,
-        inputs=[audio_stream, transcript_state, last_summary_time, summary_state],
-        outputs=[transcription_output, summary_output, transcript_state, last_summary_time, summary_state],
+        inputs=[gr.Audio(sources=["microphone"], streaming=True), gr.State(""), gr.State(0.0), gr.State("")],
+        outputs=[transcription_output, summary_output, gr.State(""), gr.State(0.0), gr.State("")]
     )
-
+
+    # Clear button
+    def clear_state():
+        return "", "", 0.0, ""
     clear_button = gr.Button("Clear Transcript & Summary")
-    clear_button.click(fn=lambda: ("", "", 0.0, ""), inputs=[], outputs=[transcription_output, summary_output, transcript_state, last_summary_time, summary_state])
+    clear_button.click(fn=lambda: ("", "", "", 0.0, ""), inputs=[], outputs=[transcription_output, summary_output, gr.State(""), gr.State(0.0), gr.State("")])
 
+print("Launching Gradio app...")
 demo.queue()
 demo.launch(debug=True, share=True)
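
For reference, format_summary_as_bullets is unchanged in this commit apart from its docstring: it splits the summarizer's output on sentence boundaries and prefixes each sentence with a dash. A quick standalone check with a made-up summary string:

def format_summary_as_bullets(summary_text):
    """Format summary into bullet points."""
    if not summary_text:
        return ""
    sentences = summary_text.replace(". ", ".\n- ").split('\n')
    return "- " + "\n".join(sentences).strip()

print(format_summary_as_bullets("The team met. Budget was approved. Next steps were agreed."))
# - The team met.
# - Budget was approved.
# - Next steps were agreed.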
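Both revisions scale integer microphone samples to float32 before transcription, since Whisper's feature extractor expects samples in the [-1.0, 1.0] range. A minimal sketch of that conversion, assuming Gradio delivers int16 PCM as its numpy microphone format (the sample rate and 440 Hz tone are made up for the demo):

import numpy as np

sr = 16000  # assumed sample rate for the sketch
# Fake one second of int16 microphone audio: a 440 Hz tone.
chunk = (np.sin(2 * np.pi * 440 * np.arange(sr) / sr) * 32767).astype(np.int16)

# Same scaling as app.py: int16 full scale is 32768, so this maps
# samples into [-1.0, 1.0).
chunk = chunk.astype(np.float32) / 32768.0
assert chunk.dtype == np.float32 and np.abs(chunk).max() <= 1.0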
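One caveat worth flagging in the new wiring: every gr.State(...) and gr.Audio(...) call creates a distinct component, so the instances built inline inside inputs=[...] and outputs=[...] never share values, and .stream() ends up attached to an audio component that is never rendered. The previous revision's pattern, creating each component once and reusing the same Python objects in the event wiring, is what Gradio's state mechanism expects; a minimal sketch with a stand-in callback:

import gradio as gr

# Stand-in for the process_audio_stream defined in the diff above.
def process_audio_stream(chunk, transcript, last_time, summary):
    return transcript, summary, transcript, last_time, summary

with gr.Blocks() as demo:
    # Create each stateful component exactly once...
    transcript_state = gr.State("")
    last_summary_time = gr.State(0.0)
    summary_state = gr.State("")
    audio_stream = gr.Audio(sources=["microphone"], streaming=True, type="numpy")
    transcription_output = gr.Textbox(label="Full Transcription")
    summary_output = gr.Textbox(label="Bullet Point Summary")

    # ...and reuse the same objects in the event wiring, so state written
    # to the outputs is what arrives in the inputs on the next chunk.
    audio_stream.stream(
        fn=process_audio_stream,
        inputs=[audio_stream, transcript_state, last_summary_time, summary_state],
        outputs=[transcription_output, summary_output,
                 transcript_state, last_summary_time, summary_state],
    )

The same applies to the clear handler's outputs. Separately, _js is the Gradio 3.x spelling of the JavaScript hook; on Gradio 4 it would need to be js.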