Spaces:

Heng2004
/

Laos-Natural-Science-Chatbot

Sleeping

App Files Files Community

Heng2004 committed 3 days ago

Commit

1c2d1e9

verified ·

1 Parent(s): 7bcfb76

Update model_utils.py

Browse files

Files changed (1) hide show

model_utils.py +60 -22

model_utils.py CHANGED Viewed

@@ -255,35 +255,75 @@ def retrieve_context(question: str, max_entries: int = MAX_CONTEXT_ENTRIES) -> s
 # -----------------------------
 # Glossary-based answering
 # -----------------------------
 def answer_from_glossary(message: str) -> Optional[str]:
     """
     Try to answer using the glossary index.
-    Priority 1: Exact string match of the Term inside the user's message.
-    Priority 2: Vector embedding match (if confidence is high).
     """
     if not getattr(qa_store, "GLOSSARY", None):
         return None
-    # --- FIX START: Check for EXACT term match first ---
-    # This fixes the issue where "What is Science" matches "Pollution"
-    # just because "Pollution" definition contains the word "Science".
-    normalized_msg = message.lower().strip()
-    for item in qa_store.GLOSSARY:
-        term = item.get("term", "").lower().strip()
-        # If the specific term appears in the message (e.g. "Science" in "What is Science?")
-        if term and term in normalized_msg:
-            # Optional: Check if the message is SHORT (so we don't trigger on long sentences accidentally)
-            if len(normalized_msg) < len(term) + 20:
-                definition = item.get("definition", "").strip()
-                example = item.get("example", "").strip()
-                if example:
-                    return f"{definition} ຕົວຢ່າງ: {example}"
-                return definition
     # --- FIX END ---
-    # If no exact text match, proceed to Vector Similarity (the old code)
     if qa_store.GLOSSARY_EMBEDDINGS is None:
         return None
@@ -297,8 +337,7 @@ def answer_from_glossary(message: str) -> Optional[str]:
     best_idx = int(np.argmax(sims))
     best_sim = float(sims[best_idx])
-    # INCREASE THRESHOLD:
-    # Raised from 0.55 to 0.65 to prevent weak matches (like Science matching Pollution)
     if best_sim < 0.65:
         return None
@@ -308,8 +347,7 @@ def answer_from_glossary(message: str) -> Optional[str]:
     if example:
         return f"{definition} ຕົວຢ່າງ: {example}"
-    else:
-        return definition
 # -----------------------------

 # -----------------------------
 # Glossary-based answering
 # -----------------------------
+def normalize_lao_text(text: str) -> str:
+    """
+    Normalize text so glossary terms and user messages can be compared.
+    Lowercases, deletes the listed punctuation and quote characters, and
+    collapses whitespace runs. Returns "" when `text` is empty or None.
+    """
+    if not text:
+        return ""
+    # 1. Lowercase and trim leading/trailing whitespace
+    text = text.lower().strip()
+    # 2. Delete punctuation and quote characters outright (no space is inserted in their place)
+    text = re.sub(r'[?.!,;։:\'\""“”‘’]', "", text)
+    # 3. Collapse every whitespace run into a single space, then trim again
+    text = re.sub(r"\s+", " ", text)
+    return text.strip()
 def answer_from_glossary(message: str) -> Optional[str]:
     """
     Try to answer using the glossary index.
+    Tier 1: Exact/Substring match (Sorted by Length to fix overlap bugs).
+    Tier 2: Vector embedding match (Fallback).
     """
     if not getattr(qa_store, "GLOSSARY", None):
         return None
+    norm_msg = normalize_lao_text(message)
+    # --- FIX START: Sort by Length + Exact Match ---
+    # 1. Sort glossary terms by length (Longest first)
+    # This ensures we match "ນັກວິທະຍາສາດ" (12 code points) BEFORE "ວິທະຍາສາດ" (9 code points)
+    sorted_glossary = sorted(
+        qa_store.GLOSSARY,
+        key=lambda x: len(normalize_lao_text(x.get("term", ""))),
+        reverse=True
+    )
+    for item in sorted_glossary:
+        term_raw = item.get("term", "")
+        norm_term = normalize_lao_text(term_raw)
+        if not norm_term:
+            continue
+        # Condition A: EXACT Match (Perfect precision)
+        # Example: User types "ນັກວິທະຍາສາດ"
+        is_exact = (norm_msg == norm_term)
+        # Condition B: Substring Match (High precision for questions)
+        # Example: User types "ນັກວິທະຍາສາດ ແມ່ນຫຍັງ"
+        # We enforce a length check so "Science" doesn't match a huge paragraph about Pollution.
+        is_substring = (norm_term in norm_msg) and (len(norm_msg) < len(norm_term) + 20)
+        if is_exact or is_substring:
+            definition = item.get("definition", "").strip()
+            example = item.get("example", "").strip()
+            # Return the result immediately once the longest match is found
+            if example:
+                return f"{definition} ຕົວຢ່າງ: {example}"
+            return definition
     # --- FIX END ---
+    # If no text match, proceed to Vector Similarity (Tier 2)
     if qa_store.GLOSSARY_EMBEDDINGS is None:
         return None
     best_idx = int(np.argmax(sims))
     best_sim = float(sims[best_idx])
+    # Threshold 0.65 to prevent weak matches
     if best_sim < 0.65:
         return None
     if example:
         return f"{definition} ຕົວຢ່າງ: {example}"
+    return definition
 # -----------------------------