Heng2004 committed on
Commit
31e421c
·
verified ·
1 Parent(s): 1524d32

Update loader.py

Browse files
Files changed (1) hide show
  1. loader.py +114 -186
loader.py CHANGED
@@ -2,32 +2,32 @@
2
  import os
3
  import json
4
  from typing import List, Dict, Any
5
-
6
- from huggingface_hub import hf_hub_download, HfApi
7
- DATASET_REPO_ID = "Heng2004/lao-science-qa-store"
8
- DATASET_FILENAME = "manual_qa.jsonl"
9
-
10
  import qa_store
11
 
12
- # Base paths (make them relative to this file)
 
 
13
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
14
  DATA_DIR = os.path.join(BASE_DIR, "data")
15
 
16
- CURRICULUM_PATH = os.path.join(DATA_DIR, "M_1_U_1.jsonl")
17
  MANUAL_QA_PATH = os.path.join(DATA_DIR, "manual_qa.jsonl")
18
 
19
- GLOSSARY_PATH = os.path.join(DATA_DIR, "glossary.jsonl")
20
-
21
- # Add this new filename constant
22
  CACHE_FILENAME = "cached_embeddings.pt"
23
  CACHE_PATH = os.path.join(DATA_DIR, CACHE_FILENAME)
24
 
 
 
 
 
 
 
 
25
  def sync_upload_cache() -> str:
26
  """Upload the cached_embeddings.pt to Hugging Face Dataset."""
27
  if not DATASET_REPO_ID or "YOUR_USERNAME" in DATASET_REPO_ID:
28
  return "⚠️ Upload Skipped (Repo ID not set)"
29
-
30
- print(f"[INFO] Uploading {CACHE_FILENAME}...")
31
  try:
32
  from huggingface_hub import HfApi
33
  api = HfApi()
@@ -45,14 +45,10 @@ def sync_upload_cache() -> str:
45
 
46
  def sync_download_cache() -> None:
47
  """Download cached_embeddings.pt at startup."""
48
- if not DATASET_REPO_ID:
49
- return
50
-
51
- print(f"[INFO] Downloading {CACHE_FILENAME}...")
52
  try:
53
  from huggingface_hub import hf_hub_download
54
  import shutil
55
-
56
  downloaded_path = hf_hub_download(
57
  repo_id=DATASET_REPO_ID,
58
  filename=CACHE_FILENAME,
@@ -65,19 +61,11 @@ def sync_download_cache() -> None:
65
  print(f"[WARN] Could not download cache (First run?): {e}")
66
 
67
  def sync_upload_manual_qa() -> str:
68
- """
69
- Upload the local manual_qa.jsonl back to the Hugging Face Dataset repo.
70
- Returns a status message string to display in the UI.
71
- """
72
  if not DATASET_REPO_ID or "YOUR_USERNAME" in DATASET_REPO_ID:
73
- return "⚠️ Upload Skipped (Repo ID not set)"
74
-
75
- print(f"[INFO] Uploading {DATASET_FILENAME} to {DATASET_REPO_ID}...")
76
  try:
77
  from huggingface_hub import HfApi
78
-
79
  api = HfApi()
80
-
81
  api.upload_file(
82
  path_or_fileobj=MANUAL_QA_PATH,
83
  path_in_repo=DATASET_FILENAME,
@@ -85,123 +73,123 @@ def sync_upload_manual_qa() -> str:
85
  repo_type="dataset",
86
  commit_message="Teacher Panel: Updated Q&A data"
87
  )
88
- print("[INFO] Upload success!")
89
  return "☁️ Cloud Upload Success"
90
-
91
  except Exception as e:
92
- print(f"[ERROR] Could not upload manual_qa.jsonl: {e}")
93
  return f"⚠️ Cloud Upload Failed: {e}"
94
 
95
  def sync_download_manual_qa() -> None:
96
- """
97
- Download the latest manual_qa.jsonl from the Hugging Face Dataset repo
98
- at startup so we don't lose previous teacher edits.
99
- """
100
- if not DATASET_REPO_ID or "YOUR_USERNAME" in DATASET_REPO_ID:
101
- print("[WARN] DATASET_REPO_ID is not set. Skipping download.")
102
- return
103
-
104
- print(f"[INFO] Downloading {DATASET_FILENAME} from {DATASET_REPO_ID}...")
105
  try:
106
  from huggingface_hub import hf_hub_download
107
-
108
- # Download file to a temporary path first
109
  downloaded_path = hf_hub_download(
110
  repo_id=DATASET_REPO_ID,
111
  filename=DATASET_FILENAME,
112
  repo_type="dataset",
113
- token=os.environ.get("HF_TOKEN") # Uses the Space's system token
114
  )
115
-
116
- # Copy it to our local data folder
117
- import shutil
118
- target_path = MANUAL_QA_PATH
119
- shutil.copy(downloaded_path, target_path)
120
- print("[INFO] Download success!")
121
-
122
  except Exception as e:
123
  print(f"[WARN] Could not download manual_qa.jsonl: {e}")
124
- print("[INFO] Starting with empty or local manual_qa.jsonl instead.")
 
 
 
 
125
 
126
  def load_curriculum() -> None:
127
  """
128
- Load official textbook JSONL into qa_store.ENTRIES and AUTO_QA_KNOWLEDGE.
 
129
  """
130
  qa_store.ENTRIES.clear()
131
  qa_store.AUTO_QA_KNOWLEDGE.clear()
132
 
133
- if not os.path.exists(CURRICULUM_PATH):
134
- print(f"[WARN] Curriculum file not found: {CURRICULUM_PATH}")
135
- qa_store.RAW_KNOWLEDGE = "ຍັງບໍ່ມີຂໍ້ມູນປະຫວັດສາດຖືກໂຫຼດ."
136
- return
137
-
138
-
139
- with open(CURRICULUM_PATH, "r", encoding="utf-8") as f:
140
- for line in f:
141
- line = line.strip()
142
- if not line:
143
- continue
144
- try:
145
- obj: Dict[str, Any] = json.loads(line)
146
- except json.JSONDecodeError:
147
- print("[WARN] Skipping invalid JSON line in curriculum file.")
148
- continue
149
-
150
-
151
- if "text" not in obj:
152
- continue
153
-
154
- qa_store.ENTRIES.append(obj)
155
-
156
- for pair in obj.get("qa", []):
157
- q = (pair.get("q") or "").strip()
158
- a = (pair.get("a") or "").strip()
159
- if not q or not a:
160
- continue
161
- norm_q = qa_store.normalize_question(q)
162
- qa_store.AUTO_QA_KNOWLEDGE.append(
163
- {
164
- "norm_q": norm_q,
165
- "q": q,
166
- "a": a,
167
- "source": "auto",
168
- "id": obj.get("id", ""),
169
- }
170
- )
171
 
172
  if qa_store.ENTRIES:
173
  qa_store.RAW_KNOWLEDGE = "\n\n".join(e["text"] for e in qa_store.ENTRIES)
 
174
  else:
175
- qa_store.RAW_KNOWLEDGE = "ຍັງບໍ່ມີຂໍ້ມູນປະຫວັດສາດທີ່ອ່ານໄດ້."
176
-
177
-
178
- def load_glossary() -> None:
179
- """Load glossary entries into qa_store.GLOSSARY."""
180
- qa_store.GLOSSARY.clear()
181
 
182
- if not os.path.exists(GLOSSARY_PATH):
183
- print(f"[WARN] Glossary file not found: {GLOSSARY_PATH}")
184
- return
185
 
186
- with open(GLOSSARY_PATH, "r", encoding="utf-8") as f:
 
 
187
  for line in f:
188
  line = line.strip()
189
- if not line:
190
- continue
191
  try:
192
  obj = json.loads(line)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
  except json.JSONDecodeError:
194
- print("[WARN] Skipping invalid glossary JSON line")
195
  continue
196
- qa_store.GLOSSARY.append(obj)
197
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
  print(f"[INFO] Loaded {len(qa_store.GLOSSARY)} glossary terms.")
199
 
200
 
 
 
 
 
201
  def load_manual_qa() -> None:
202
- """
203
- Load manual_qa.jsonl into qa_store.MANUAL_QA_LIST and MANUAL_QA_INDEX.
204
- """
205
  qa_store.MANUAL_QA_LIST.clear()
206
  qa_store.MANUAL_QA_INDEX.clear()
207
  max_num = 0
@@ -211,120 +199,60 @@ def load_manual_qa() -> None:
211
  qa_store.NEXT_MANUAL_ID = 1
212
  return
213
 
214
-
215
  with open(MANUAL_QA_PATH, "r", encoding="utf-8") as f:
216
  for line in f:
217
  line = line.strip()
218
- if not line:
219
- continue
220
  try:
221
  obj = json.loads(line)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
222
  except json.JSONDecodeError:
223
- print("[WARN] Skipping invalid JSON line in manual QA file.")
224
  continue
225
 
226
-
227
- q = (obj.get("q") or "").strip()
228
- a = (obj.get("a") or "").strip()
229
- if not q or not a:
230
- continue
231
-
232
- entry_id = str(obj.get("id") or "")
233
- if not entry_id:
234
- max_num += 1
235
- entry_id = f"manual_{max_num:04d}"
236
-
237
- # track biggest number in id
238
- import re as _re
239
-
240
- m = _re.search(r"(\d+)$", entry_id)
241
- if m:
242
- max_num = max(max_num, int(m.group(1)))
243
-
244
- norm_q = qa_store.normalize_question(q)
245
- entry = {
246
- "id": entry_id,
247
- "q": q,
248
- "a": a,
249
- "norm_q": norm_q,
250
- }
251
- qa_store.MANUAL_QA_LIST.append(entry)
252
- qa_store.MANUAL_QA_INDEX[norm_q] = entry
253
-
254
  qa_store.NEXT_MANUAL_ID = max_num + 1 if max_num > 0 else 1
255
 
256
-
257
- # loader.py
258
-
259
  def generate_new_manual_id() -> str:
260
- """
261
- Generate the smallest free manual_XXXX ID based on the
262
- current MANUAL_QA_LIST (so gaps like 11 after delete
263
- are reused).
264
- """
265
- import re as _re
266
-
267
  used_nums = set()
268
-
269
- # collect all numbers that are already used in IDs
270
  for e in qa_store.MANUAL_QA_LIST:
271
  raw_id = str(e.get("id") or "")
272
- m = _re.search(r"(\d+)$", raw_id)
273
- if m:
274
- used_nums.add(int(m.group(1)))
275
-
276
- # find the smallest positive integer that is not used
277
  i = 1
278
- while i in used_nums:
279
- i += 1
280
-
281
- # keep the global counter roughly in sync (optional)
282
- qa_store.NEXT_MANUAL_ID = i + 1
283
-
284
  return f"manual_{i:04d}"
285
 
286
-
287
-
288
  def save_manual_qa_file() -> None:
289
- """
290
- Persist MANUAL_QA_LIST to data/manual_qa.jsonl.
291
- """
292
  os.makedirs(os.path.dirname(MANUAL_QA_PATH), exist_ok=True)
293
  with open(MANUAL_QA_PATH, "w", encoding="utf-8") as f:
294
  for e in qa_store.MANUAL_QA_LIST:
295
  obj = {"id": e["id"], "q": e["q"], "a": e["a"]}
296
  f.write(json.dumps(obj, ensure_ascii=False) + "\n")
297
 
298
-
299
  def rebuild_combined_qa() -> None:
300
- """
301
- Combine auto and manual QA into QA_INDEX & ALL_QA_KNOWLEDGE.
302
- Manual answers override auto ones if same normalized question.
303
- """
304
  qa_store.QA_INDEX.clear()
305
  qa_store.ALL_QA_KNOWLEDGE.clear()
306
-
307
- # auto first
308
  for item in qa_store.AUTO_QA_KNOWLEDGE:
309
  norm_q = item["norm_q"]
310
  qa_store.QA_INDEX[norm_q] = item["a"]
311
  qa_store.ALL_QA_KNOWLEDGE.append(item)
312
-
313
- # manual overrides
314
  for e in qa_store.MANUAL_QA_LIST:
315
- item = {
316
- "norm_q": e["norm_q"],
317
- "q": e["q"],
318
- "a": e["a"],
319
- "source": "manual",
320
- "id": e["id"],
321
- }
322
  qa_store.QA_INDEX[item["norm_q"]] = item["a"]
323
  qa_store.ALL_QA_KNOWLEDGE.append(item)
324
 
325
-
326
  def manual_qa_table_data() -> List[List[str]]:
327
- """
328
- Table rows for Teacher Panel.
329
- """
330
- return [[e["id"], e["q"], e["a"]] for e in qa_store.MANUAL_QA_LIST]
 
2
  import os
3
  import json
4
  from typing import List, Dict, Any
 
 
 
 
 
5
  import qa_store
6
 
7
+ # ---------------------------------------------------------
8
+ # CONFIGURATION
9
+ # ---------------------------------------------------------
10
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
11
  DATA_DIR = os.path.join(BASE_DIR, "data")
12
 
13
+ # Keep Manual QA global so Teacher Panel can write to it easily
14
  MANUAL_QA_PATH = os.path.join(DATA_DIR, "manual_qa.jsonl")
15
 
16
+ # Cache file (Generated locally)
 
 
17
  CACHE_FILENAME = "cached_embeddings.pt"
18
  CACHE_PATH = os.path.join(DATA_DIR, CACHE_FILENAME)
19
 
20
+ DATASET_REPO_ID = "Heng2004/lao-science-qa-store"
21
+ DATASET_FILENAME = "manual_qa.jsonl"
22
+
23
+
24
+ # ---------------------------------------------------------
25
+ # CLOUD SYNC (Unchanged)
26
+ # ---------------------------------------------------------
27
  def sync_upload_cache() -> str:
28
  """Upload the cached_embeddings.pt to Hugging Face Dataset."""
29
  if not DATASET_REPO_ID or "YOUR_USERNAME" in DATASET_REPO_ID:
30
  return "⚠️ Upload Skipped (Repo ID not set)"
 
 
31
  try:
32
  from huggingface_hub import HfApi
33
  api = HfApi()
 
45
 
46
  def sync_download_cache() -> None:
47
  """Download cached_embeddings.pt at startup."""
48
+ if not DATASET_REPO_ID: return
 
 
 
49
  try:
50
  from huggingface_hub import hf_hub_download
51
  import shutil
 
52
  downloaded_path = hf_hub_download(
53
  repo_id=DATASET_REPO_ID,
54
  filename=CACHE_FILENAME,
 
61
  print(f"[WARN] Could not download cache (First run?): {e}")
62
 
63
  def sync_upload_manual_qa() -> str:
 
 
 
 
64
  if not DATASET_REPO_ID or "YOUR_USERNAME" in DATASET_REPO_ID:
65
+ return "⚠️ Upload Skipped"
 
 
66
  try:
67
  from huggingface_hub import HfApi
 
68
  api = HfApi()
 
69
  api.upload_file(
70
  path_or_fileobj=MANUAL_QA_PATH,
71
  path_in_repo=DATASET_FILENAME,
 
73
  repo_type="dataset",
74
  commit_message="Teacher Panel: Updated Q&A data"
75
  )
 
76
  return "☁️ Cloud Upload Success"
 
77
  except Exception as e:
 
78
  return f"⚠️ Cloud Upload Failed: {e}"
79
 
80
  def sync_download_manual_qa() -> None:
81
+ if not DATASET_REPO_ID: return
 
 
 
 
 
 
 
 
82
  try:
83
  from huggingface_hub import hf_hub_download
84
+ import shutil
 
85
  downloaded_path = hf_hub_download(
86
  repo_id=DATASET_REPO_ID,
87
  filename=DATASET_FILENAME,
88
  repo_type="dataset",
89
+ token=os.environ.get("HF_TOKEN")
90
  )
91
+ shutil.copy(downloaded_path, MANUAL_QA_PATH)
92
+ print("[INFO] Manual QA download success!")
 
 
 
 
 
93
  except Exception as e:
94
  print(f"[WARN] Could not download manual_qa.jsonl: {e}")
95
+
96
+
97
+ # ---------------------------------------------------------
98
+ # RECURSIVE LOADERS (The New Upgrade)
99
+ # ---------------------------------------------------------
100
 
101
  def load_curriculum() -> None:
102
  """
103
+ Recursively find and load all textbook JSONL files in data/
104
+ Looks for files named 'textbook.jsonl' OR starting with 'M'.
105
  """
106
  qa_store.ENTRIES.clear()
107
  qa_store.AUTO_QA_KNOWLEDGE.clear()
108
 
109
+ print(f"[INFO] Scanning {DATA_DIR} for textbook content...")
110
+
111
+ file_count = 0
112
+ # os.walk goes deep into M_1/U_1/...
113
+ for root, dirs, files in os.walk(DATA_DIR):
114
+ for file in files:
115
+ # Logic: Match specific filenames
116
+ is_textbook = file == "textbook.jsonl" or (file.startswith("M") and file.endswith(".jsonl"))
117
+
118
+ if is_textbook:
119
+ full_path = os.path.join(root, file)
120
+ _parse_curriculum_file(full_path)
121
+ file_count += 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
 
123
  if qa_store.ENTRIES:
124
  qa_store.RAW_KNOWLEDGE = "\n\n".join(e["text"] for e in qa_store.ENTRIES)
125
+ print(f"[INFO] Loaded {len(qa_store.ENTRIES)} entries from {file_count} files.")
126
  else:
127
+ qa_store.RAW_KNOWLEDGE = "ຍັງບໍ່ມີຂໍ້ມູນ."
128
+ print("[WARN] No curriculum files found.")
 
 
 
 
129
 
 
 
 
130
 
131
+ def _parse_curriculum_file(path: str):
132
+ """Helper to read a single textbook file"""
133
+ with open(path, "r", encoding="utf-8") as f:
134
  for line in f:
135
  line = line.strip()
136
+ if not line: continue
 
137
  try:
138
  obj = json.loads(line)
139
+ if "text" not in obj: continue
140
+
141
+ qa_store.ENTRIES.append(obj)
142
+
143
+ # Extract Auto-QA
144
+ for pair in obj.get("qa", []):
145
+ q = (pair.get("q") or "").strip()
146
+ a = (pair.get("a") or "").strip()
147
+ if q and a:
148
+ norm_q = qa_store.normalize_question(q)
149
+ qa_store.AUTO_QA_KNOWLEDGE.append({
150
+ "norm_q": norm_q,
151
+ "q": q,
152
+ "a": a,
153
+ "source": "auto",
154
+ "id": obj.get("id", "")
155
+ })
156
  except json.JSONDecodeError:
 
157
  continue
 
158
 
159
+
160
+ def load_glossary() -> None:
161
+ """
162
+ Recursively find and load all glossary JSONL files.
163
+ Looks for files named 'glossary.jsonl' OR starting with 'glossary'.
164
+ """
165
+ qa_store.GLOSSARY.clear()
166
+
167
+ print(f"[INFO] Scanning {DATA_DIR} for glossary files...")
168
+
169
+ for root, dirs, files in os.walk(DATA_DIR):
170
+ for file in files:
171
+ is_glossary = "glossary" in file and file.endswith(".jsonl")
172
+
173
+ if is_glossary:
174
+ full_path = os.path.join(root, file)
175
+ with open(full_path, "r", encoding="utf-8") as f:
176
+ for line in f:
177
+ line = line.strip()
178
+ if not line: continue
179
+ try:
180
+ obj = json.loads(line)
181
+ qa_store.GLOSSARY.append(obj)
182
+ except json.JSONDecodeError:
183
+ continue
184
+
185
  print(f"[INFO] Loaded {len(qa_store.GLOSSARY)} glossary terms.")
186
 
187
 
188
+ # ---------------------------------------------------------
189
+ # MANUAL QA & UTILS (Same as before)
190
+ # ---------------------------------------------------------
191
+
192
  def load_manual_qa() -> None:
 
 
 
193
  qa_store.MANUAL_QA_LIST.clear()
194
  qa_store.MANUAL_QA_INDEX.clear()
195
  max_num = 0
 
199
  qa_store.NEXT_MANUAL_ID = 1
200
  return
201
 
 
202
  with open(MANUAL_QA_PATH, "r", encoding="utf-8") as f:
203
  for line in f:
204
  line = line.strip()
205
+ if not line: continue
 
206
  try:
207
  obj = json.loads(line)
208
+ entry_id = str(obj.get("id") or "")
209
+
210
+ # ID tracking logic
211
+ import re
212
+ m = re.search(r"(\d+)$", entry_id)
213
+ if m: max_num = max(max_num, int(m.group(1)))
214
+
215
+ q = (obj.get("q") or "").strip()
216
+ a = (obj.get("a") or "").strip()
217
+ if q and a:
218
+ norm_q = qa_store.normalize_question(q)
219
+ entry = {"id": entry_id, "q": q, "a": a, "norm_q": norm_q}
220
+ qa_store.MANUAL_QA_LIST.append(entry)
221
+ qa_store.MANUAL_QA_INDEX[norm_q] = entry
222
  except json.JSONDecodeError:
 
223
  continue
224
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
  qa_store.NEXT_MANUAL_ID = max_num + 1 if max_num > 0 else 1
226
 
 
 
 
227
  def generate_new_manual_id() -> str:
228
+ import re
 
 
 
 
 
 
229
  used_nums = set()
 
 
230
  for e in qa_store.MANUAL_QA_LIST:
231
  raw_id = str(e.get("id") or "")
232
+ m = re.search(r"(\d+)$", raw_id)
233
+ if m: used_nums.add(int(m.group(1)))
 
 
 
234
  i = 1
235
+ while i in used_nums: i += 1
 
 
 
 
 
236
  return f"manual_{i:04d}"
237
 
 
 
238
  def save_manual_qa_file() -> None:
 
 
 
239
  os.makedirs(os.path.dirname(MANUAL_QA_PATH), exist_ok=True)
240
  with open(MANUAL_QA_PATH, "w", encoding="utf-8") as f:
241
  for e in qa_store.MANUAL_QA_LIST:
242
  obj = {"id": e["id"], "q": e["q"], "a": e["a"]}
243
  f.write(json.dumps(obj, ensure_ascii=False) + "\n")
244
 
 
245
  def rebuild_combined_qa() -> None:
 
 
 
 
246
  qa_store.QA_INDEX.clear()
247
  qa_store.ALL_QA_KNOWLEDGE.clear()
 
 
248
  for item in qa_store.AUTO_QA_KNOWLEDGE:
249
  norm_q = item["norm_q"]
250
  qa_store.QA_INDEX[norm_q] = item["a"]
251
  qa_store.ALL_QA_KNOWLEDGE.append(item)
 
 
252
  for e in qa_store.MANUAL_QA_LIST:
253
+ item = {"norm_q": e["norm_q"], "q": e["q"], "a": e["a"], "source": "manual", "id": e["id"]}
 
 
 
 
 
 
254
  qa_store.QA_INDEX[item["norm_q"]] = item["a"]
255
  qa_store.ALL_QA_KNOWLEDGE.append(item)
256
 
 
257
  def manual_qa_table_data() -> List[List[str]]:
258
+ return [[e["id"], e["q"], e["a"]] for e in qa_store.MANUAL_QA_LIST]