Update model_utils.py
Browse files- model_utils.py +60 -22
model_utils.py
CHANGED
|
@@ -255,35 +255,75 @@ def retrieve_context(question: str, max_entries: int = MAX_CONTEXT_ENTRIES) -> s
|
|
| 255 |
# -----------------------------
|
| 256 |
# Glossary-based answering
|
| 257 |
# -----------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 258 |
def answer_from_glossary(message: str) -> Optional[str]:
|
| 259 |
"""
|
| 260 |
Try to answer using the glossary index.
|
| 261 |
-
|
| 262 |
-
|
| 263 |
"""
|
| 264 |
if not getattr(qa_store, "GLOSSARY", None):
|
| 265 |
return None
|
| 266 |
|
| 267 |
-
|
| 268 |
-
# This fixes the issue where "What is Science" matches "Pollution"
|
| 269 |
-
# just because "Pollution" definition contains the word "Science".
|
| 270 |
|
| 271 |
-
|
| 272 |
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 284 |
# --- FIX END ---
|
| 285 |
|
| 286 |
-
# If no
|
| 287 |
if qa_store.GLOSSARY_EMBEDDINGS is None:
|
| 288 |
return None
|
| 289 |
|
|
@@ -297,8 +337,7 @@ def answer_from_glossary(message: str) -> Optional[str]:
|
|
| 297 |
best_idx = int(np.argmax(sims))
|
| 298 |
best_sim = float(sims[best_idx])
|
| 299 |
|
| 300 |
-
#
|
| 301 |
-
# Raised from 0.55 to 0.65 to prevent weak matches (like Science matching Pollution)
|
| 302 |
if best_sim < 0.65:
|
| 303 |
return None
|
| 304 |
|
|
@@ -308,8 +347,7 @@ def answer_from_glossary(message: str) -> Optional[str]:
|
|
| 308 |
|
| 309 |
if example:
|
| 310 |
return f"{definition} ຕົວຢ່າງ: {example}"
|
| 311 |
-
|
| 312 |
-
return definition
|
| 313 |
|
| 314 |
|
| 315 |
# -----------------------------
|
|
|
|
| 255 |
# -----------------------------
|
| 256 |
# Glossary-based answering
|
| 257 |
# -----------------------------
|
| 258 |
+
|
| 259 |
+
def normalize_lao_text(text: str) -> str:
    """
    Normalize text so glossary terms and user messages compare reliably.

    Steps, in order:
      1. Lowercase and trim (affects Latin text; Lao has no letter case).
      2. Apply Unicode NFC so combining vowel/tone marks typed in a
         different order end up in one canonical sequence.
      3. Drop zero-width / invisible format characters.
      4. Remove common ASCII and typographic punctuation.
      5. Collapse whitespace runs to a single space and trim again.

    Args:
        text: Raw term or user message. Empty or None input is tolerated.

    Returns:
        The normalized string, or "" when the input is empty/None.
    """
    # Local import keeps this block self-contained; `re` is expected to be
    # imported at module level, as in the original code.
    import unicodedata

    if not text:
        return ""

    text = text.lower().strip()

    # Canonical composition/reordering: the same visible Lao syllable can be
    # entered as different code-point sequences (e.g. tone mark before or
    # after a below-base vowel sign); NFC makes them identical.
    text = unicodedata.normalize("NFC", text)

    # Zero-width space/joiners, word joiner and BOM are NOT matched by `\s`,
    # so without this step an invisible word break inside a term would
    # defeat both the exact and the substring comparison.
    text = re.sub(r"[\u200b\u200c\u200d\u2060\ufeff]", "", text)

    # Remove punctuation (same character set as before, minus a duplicated
    # double quote).
    text = re.sub(r'[?.!,;։:\'"“”‘’]', "", text)

    # Collapse any run of whitespace into a single space.
    text = re.sub(r"\s+", " ", text)

    return text.strip()
|
| 277 |
+
|
| 278 |
def answer_from_glossary(message: str) -> Optional[str]:
|
| 279 |
"""
|
| 280 |
Try to answer using the glossary index.
|
| 281 |
+
Tier 1: Exact/Substring match (Sorted by Length to fix overlap bugs).
|
| 282 |
+
Tier 2: Vector embedding match (Fallback).
|
| 283 |
"""
|
| 284 |
if not getattr(qa_store, "GLOSSARY", None):
|
| 285 |
return None
|
| 286 |
|
| 287 |
+
norm_msg = normalize_lao_text(message)
|
|
|
|
|
|
|
| 288 |
|
| 289 |
+
# --- FIX START: Sort by Length + Exact Match ---
|
| 290 |
|
| 291 |
+
# 1. Sort glossary terms by length (Longest first)
|
| 292 |
+
# This ensures we match "ນັກວິທະຍາສາດ" (12 code points) BEFORE "ວິທະຍາສາດ" (9 code points)
|
| 293 |
+
sorted_glossary = sorted(
|
| 294 |
+
qa_store.GLOSSARY,
|
| 295 |
+
key=lambda x: len(normalize_lao_text(x.get("term", ""))),
|
| 296 |
+
reverse=True
|
| 297 |
+
)
|
| 298 |
+
|
| 299 |
+
for item in sorted_glossary:
|
| 300 |
+
term_raw = item.get("term", "")
|
| 301 |
+
norm_term = normalize_lao_text(term_raw)
|
| 302 |
+
|
| 303 |
+
if not norm_term:
|
| 304 |
+
continue
|
| 305 |
+
|
| 306 |
+
# Condition A: EXACT Match (Perfect precision)
|
| 307 |
+
# Example: User types "ນັກວິທະຍາສາດ"
|
| 308 |
+
is_exact = (norm_msg == norm_term)
|
| 309 |
+
|
| 310 |
+
# Condition B: Substring Match (High precision for questions)
|
| 311 |
+
# Example: User types "ນັກວິທະຍາສາດ ແມ່ນຫຍັງ"
|
| 312 |
+
# We enforce a length check so "Science" doesn't match a huge paragraph about Pollution.
|
| 313 |
+
is_substring = (norm_term in norm_msg) and (len(norm_msg) < len(norm_term) + 20)
|
| 314 |
+
|
| 315 |
+
if is_exact or is_substring:
|
| 316 |
+
definition = item.get("definition", "").strip()
|
| 317 |
+
example = item.get("example", "").strip()
|
| 318 |
+
|
| 319 |
+
# Return the result immediately once the longest match is found
|
| 320 |
+
if example:
|
| 321 |
+
return f"{definition} ຕົວຢ່າງ: {example}"
|
| 322 |
+
return definition
|
| 323 |
+
|
| 324 |
# --- FIX END ---
|
| 325 |
|
| 326 |
+
# If no text match, proceed to Vector Similarity (Tier 2)
|
| 327 |
if qa_store.GLOSSARY_EMBEDDINGS is None:
|
| 328 |
return None
|
| 329 |
|
|
|
|
| 337 |
best_idx = int(np.argmax(sims))
|
| 338 |
best_sim = float(sims[best_idx])
|
| 339 |
|
| 340 |
+
# Threshold 0.65 to prevent weak matches
|
|
|
|
| 341 |
if best_sim < 0.65:
|
| 342 |
return None
|
| 343 |
|
|
|
|
| 347 |
|
| 348 |
if example:
|
| 349 |
return f"{definition} ຕົວຢ່າງ: {example}"
|
| 350 |
+
return definition
|
|
|
|
| 351 |
|
| 352 |
|
| 353 |
# -----------------------------
|