Heng2004 committed on
Commit
1c2d1e9
·
verified ·
1 Parent(s): 7bcfb76

Update model_utils.py

Browse files
Files changed (1) hide show
  1. model_utils.py +60 -22
model_utils.py CHANGED
@@ -255,35 +255,75 @@ def retrieve_context(question: str, max_entries: int = MAX_CONTEXT_ENTRIES) -> s
255
  # -----------------------------
256
  # Glossary-based answering
257
  # -----------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
258
  def answer_from_glossary(message: str) -> Optional[str]:
259
  """
260
  Try to answer using the glossary index.
261
- Priority 1: Exact string match of the Term inside the user's message.
262
- Priority 2: Vector embedding match (if confidence is high).
263
  """
264
  if not getattr(qa_store, "GLOSSARY", None):
265
  return None
266
 
267
- # --- FIX START: Check for EXACT term match first ---
268
- # This fixes the issue where "What is Science" matches "Pollution"
269
- # just because "Pollution" definition contains the word "Science".
270
 
271
- normalized_msg = message.lower().strip()
272
 
273
- for item in qa_store.GLOSSARY:
274
- term = item.get("term", "").lower().strip()
275
- # If the specific term appears in the message (e.g. "Science" in "What is Science?")
276
- if term and term in normalized_msg:
277
- # Optional: Check if the message is SHORT (so we don't trigger on long sentences accidentally)
278
- if len(normalized_msg) < len(term) + 20:
279
- definition = item.get("definition", "").strip()
280
- example = item.get("example", "").strip()
281
- if example:
282
- return f"{definition} ຕົວຢ່າງ: {example}"
283
- return definition
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
284
  # --- FIX END ---
285
 
286
- # If no exact text match, proceed to Vector Similarity (the old code)
287
  if qa_store.GLOSSARY_EMBEDDINGS is None:
288
  return None
289
 
@@ -297,8 +337,7 @@ def answer_from_glossary(message: str) -> Optional[str]:
297
  best_idx = int(np.argmax(sims))
298
  best_sim = float(sims[best_idx])
299
 
300
- # INCREASE THRESHOLD:
301
- # Raised from 0.55 to 0.65 to prevent weak matches (like Science matching Pollution)
302
  if best_sim < 0.65:
303
  return None
304
 
@@ -308,8 +347,7 @@ def answer_from_glossary(message: str) -> Optional[str]:
308
 
309
  if example:
310
  return f"{definition} ຕົວຢ່າງ: {example}"
311
- else:
312
- return definition
313
 
314
 
315
  # -----------------------------
 
255
  # -----------------------------
256
  # Glossary-based answering
257
  # -----------------------------
258
+
259
# Characters deleted outright before matching. Built once at import time
# because normalize_lao_text() is called for every glossary entry.
_LAO_STRIP_TABLE = str.maketrans(
    "",
    "",
    # Punctuation ignored by the matcher (same set as before, deduplicated).
    "?.!,;։:'\"“”‘’"
    # Invisible format characters: zero width space / non-joiner / joiner,
    # word joiner and BOM. Lao text often carries U+200B as a word-break
    # hint; it is not whitespace for `\s`, so it must be removed explicitly
    # or visually identical strings will not compare equal.
    "\u200b\u200c\u200d\u2060\ufeff",
)


def normalize_lao_text(text: str) -> str:
    """
    Return a canonical form of *text* for glossary term matching.

    Steps, in order:
      1. Unicode NFC normalisation, so combining marks typed in a different
         order (e.g. tone mark before a below-vowel) compare equal.
      2. Lowercase and trim (lowercasing only affects cased scripts such as
         Latin; Lao itself has no case).
      3. Delete the punctuation and zero-width characters listed in
         ``_LAO_STRIP_TABLE``.
      4. Collapse every run of whitespace into a single space and trim.

    Args:
        text: Raw user message or glossary term. ``None`` or an empty
            string is accepted.

    Returns:
        The normalised string, or ``""`` when *text* is falsy.
    """
    # Local import keeps this block self-contained; it does not rely on the
    # file's top-level imports providing `unicodedata`.
    import unicodedata

    if not text:
        return ""

    # 1. Canonical composition / mark ordering
    text = unicodedata.normalize("NFC", text)

    # 2. Lowercase
    text = text.lower().strip()

    # 3. Remove punctuation and invisible format characters in one pass
    text = text.translate(_LAO_STRIP_TABLE)

    # 4. Collapse multiple whitespace characters into one space
    text = re.sub(r"\s+", " ", text)

    return text.strip()
277
+
278
  def answer_from_glossary(message: str) -> Optional[str]:
279
  """
280
  Try to answer using the glossary index.
281
+ Tier 1: Exact/Substring match (Sorted by Length to fix overlap bugs).
282
+ Tier 2: Vector embedding match (Fallback).
283
  """
284
  if not getattr(qa_store, "GLOSSARY", None):
285
  return None
286
 
287
+ norm_msg = normalize_lao_text(message)
 
 
288
 
289
+ # --- FIX START: Sort by Length + Exact Match ---
290
 
291
+ # 1. Sort glossary terms by length (Longest first)
292
+ # This ensures we match "ນັກວິທະຍາສາດ" (12 code points) BEFORE "ວິທະຍາສາດ" (9 code points)
293
+ sorted_glossary = sorted(
294
+ qa_store.GLOSSARY,
295
+ key=lambda x: len(normalize_lao_text(x.get("term", ""))),
296
+ reverse=True
297
+ )
298
+
299
+ for item in sorted_glossary:
300
+ term_raw = item.get("term", "")
301
+ norm_term = normalize_lao_text(term_raw)
302
+
303
+ if not norm_term:
304
+ continue
305
+
306
+ # Condition A: EXACT Match (Perfect precision)
307
+ # Example: User types "ນັກວິທະຍາສາດ"
308
+ is_exact = (norm_msg == norm_term)
309
+
310
+ # Condition B: Substring Match (High precision for questions)
311
+ # Example: User types "ນັກວິທະຍາສາດ ແມ່ນຫຍັງ"
312
+ # We enforce a length check so "Science" doesn't match a huge paragraph about Pollution.
313
+ is_substring = (norm_term in norm_msg) and (len(norm_msg) < len(norm_term) + 20)
314
+
315
+ if is_exact or is_substring:
316
+ definition = item.get("definition", "").strip()
317
+ example = item.get("example", "").strip()
318
+
319
+ # Return the result immediately once the longest match is found
320
+ if example:
321
+ return f"{definition} ຕົວຢ່າງ: {example}"
322
+ return definition
323
+
324
  # --- FIX END ---
325
 
326
+ # If no text match, proceed to Vector Similarity (Tier 2)
327
  if qa_store.GLOSSARY_EMBEDDINGS is None:
328
  return None
329
 
 
337
  best_idx = int(np.argmax(sims))
338
  best_sim = float(sims[best_idx])
339
 
340
+ # Threshold 0.65 to prevent weak matches
 
341
  if best_sim < 0.65:
342
  return None
343
 
 
347
 
348
  if example:
349
  return f"{definition} ຕົວຢ່າງ: {example}"
350
+ return definition
 
351
 
352
 
353
  # -----------------------------