Heng2004 committed on
Commit
31e421c
·
verified ·
1 Parent(s): 1524d32

Update loader.py

Browse files
Files changed (1) hide show
  1. loader.py +114 -186
loader.py CHANGED
@@ -2,32 +2,32 @@
2
  import os
3
  import json
4
  from typing import List, Dict, Any
5
-
6
- from huggingface_hub import hf_hub_download, HfApi
7
- DATASET_REPO_ID = "Heng2004/lao-science-qa-store"
8
- DATASET_FILENAME = "manual_qa.jsonl"
9
-
10
  import qa_store
11
 
12
- # Base paths (make them relative to this file)
 
 
13
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
14
  DATA_DIR = os.path.join(BASE_DIR, "data")
15
 
16
- CURRICULUM_PATH = os.path.join(DATA_DIR, "M_1_U_1.jsonl")
17
  MANUAL_QA_PATH = os.path.join(DATA_DIR, "manual_qa.jsonl")
18
 
19
- GLOSSARY_PATH = os.path.join(DATA_DIR, "glossary.jsonl")
20
-
21
- # Add this new filename constant
22
  CACHE_FILENAME = "cached_embeddings.pt"
23
  CACHE_PATH = os.path.join(DATA_DIR, CACHE_FILENAME)
24
 
 
 
 
 
 
 
 
25
  def sync_upload_cache() -> str:
26
  """Upload the cached_embeddings.pt to Hugging Face Dataset."""
27
  if not DATASET_REPO_ID or "YOUR_USERNAME" in DATASET_REPO_ID:
28
  return "⚠️ Upload Skipped (Repo ID not set)"
29
-
30
- print(f"[INFO] Uploading {CACHE_FILENAME}...")
31
  try:
32
  from huggingface_hub import HfApi
33
  api = HfApi()
@@ -45,14 +45,10 @@ def sync_upload_cache() -> str:
45
 
46
  def sync_download_cache() -> None:
47
  """Download cached_embeddings.pt at startup."""
48
- if not DATASET_REPO_ID:
49
- return
50
-
51
- print(f"[INFO] Downloading {CACHE_FILENAME}...")
52
  try:
53
  from huggingface_hub import hf_hub_download
54
  import shutil
55
-
56
  downloaded_path = hf_hub_download(
57
  repo_id=DATASET_REPO_ID,
58
  filename=CACHE_FILENAME,
@@ -65,19 +61,11 @@ def sync_download_cache() -> None:
65
  print(f"[WARN] Could not download cache (First run?): {e}")
66
 
67
  def sync_upload_manual_qa() -> str:
68
- """
69
- Upload the local manual_qa.jsonl back to the Hugging Face Dataset repo.
70
- Returns a status message string to display in the UI.
71
- """
72
  if not DATASET_REPO_ID or "YOUR_USERNAME" in DATASET_REPO_ID:
73
- return "⚠️ Upload Skipped (Repo ID not set)"
74
-
75
- print(f"[INFO] Uploading {DATASET_FILENAME} to {DATASET_REPO_ID}...")
76
  try:
77
  from huggingface_hub import HfApi
78
-
79
  api = HfApi()
80
-
81
  api.upload_file(
82
  path_or_fileobj=MANUAL_QA_PATH,
83
  path_in_repo=DATASET_FILENAME,
@@ -85,123 +73,123 @@ def sync_upload_manual_qa() -> str:
85
  repo_type="dataset",
86
  commit_message="Teacher Panel: Updated Q&A data"
87
  )
88
- print("[INFO] Upload success!")
89
  return "☁️ Cloud Upload Success"
90
-
91
  except Exception as e:
92
- print(f"[ERROR] Could not upload manual_qa.jsonl: {e}")
93
  return f"⚠️ Cloud Upload Failed: {e}"
94
 
95
  def sync_download_manual_qa() -> None:
96
- """
97
- Download the latest manual_qa.jsonl from the Hugging Face Dataset repo
98
- at startup so we don't lose previous teacher edits.
99
- """
100
- if not DATASET_REPO_ID or "YOUR_USERNAME" in DATASET_REPO_ID:
101
- print("[WARN] DATASET_REPO_ID is not set. Skipping download.")
102
- return
103
-
104
- print(f"[INFO] Downloading {DATASET_FILENAME} from {DATASET_REPO_ID}...")
105
  try:
106
  from huggingface_hub import hf_hub_download
107
-
108
- # Download file to a temporary path first
109
  downloaded_path = hf_hub_download(
110
  repo_id=DATASET_REPO_ID,
111
  filename=DATASET_FILENAME,
112
  repo_type="dataset",
113
- token=os.environ.get("HF_TOKEN") # Uses the Space's system token
114
  )
115
-
116
- # Copy it to our local data folder
117
- import shutil
118
- target_path = MANUAL_QA_PATH
119
- shutil.copy(downloaded_path, target_path)
120
- print("[INFO] Download success!")
121
-
122
  except Exception as e:
123
  print(f"[WARN] Could not download manual_qa.jsonl: {e}")
124
- print("[INFO] Starting with empty or local manual_qa.jsonl instead.")
 
 
 
 
125
 
126
  def load_curriculum() -> None:
127
  """
128
- Load official textbook JSONL into qa_store.ENTRIES and AUTO_QA_KNOWLEDGE.
 
129
  """
130
  qa_store.ENTRIES.clear()
131
  qa_store.AUTO_QA_KNOWLEDGE.clear()
132
 
133
- if not os.path.exists(CURRICULUM_PATH):
134
- print(f"[WARN] Curriculum file not found: {CURRICULUM_PATH}")
135
- qa_store.RAW_KNOWLEDGE = "ຍັງບໍ່ມີຂໍ້ມູນປະຫວັດສາດຖືກໂຫຼດ."
136
- return
137
-
138
-
139
- with open(CURRICULUM_PATH, "r", encoding="utf-8") as f:
140
- for line in f:
141
- line = line.strip()
142
- if not line:
143
- continue
144
- try:
145
- obj: Dict[str, Any] = json.loads(line)
146
- except json.JSONDecodeError:
147
- print("[WARN] Skipping invalid JSON line in curriculum file.")
148
- continue
149
-
150
-
151
- if "text" not in obj:
152
- continue
153
-
154
- qa_store.ENTRIES.append(obj)
155
-
156
- for pair in obj.get("qa", []):
157
- q = (pair.get("q") or "").strip()
158
- a = (pair.get("a") or "").strip()
159
- if not q or not a:
160
- continue
161
- norm_q = qa_store.normalize_question(q)
162
- qa_store.AUTO_QA_KNOWLEDGE.append(
163
- {
164
- "norm_q": norm_q,
165
- "q": q,
166
- "a": a,
167
- "source": "auto",
168
- "id": obj.get("id", ""),
169
- }
170
- )
171
 
172
  if qa_store.ENTRIES:
173
  qa_store.RAW_KNOWLEDGE = "\n\n".join(e["text"] for e in qa_store.ENTRIES)
 
174
  else:
175
- qa_store.RAW_KNOWLEDGE = "ຍັງບໍ່ມີຂໍ້ມູນປະຫວັດສາດທີ່ອ່ານໄດ້."
176
-
177
-
178
- def load_glossary() -> None:
179
- """Load glossary entries into qa_store.GLOSSARY."""
180
- qa_store.GLOSSARY.clear()
181
 
182
- if not os.path.exists(GLOSSARY_PATH):
183
- print(f"[WARN] Glossary file not found: {GLOSSARY_PATH}")
184
- return
185
 
186
- with open(GLOSSARY_PATH, "r", encoding="utf-8") as f:
 
 
187
  for line in f:
188
  line = line.strip()
189
- if not line:
190
- continue
191
  try:
192
  obj = json.loads(line)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
  except json.JSONDecodeError:
194
- print("[WARN] Skipping invalid glossary JSON line")
195
  continue
196
- qa_store.GLOSSARY.append(obj)
197
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
  print(f"[INFO] Loaded {len(qa_store.GLOSSARY)} glossary terms.")
199
 
200
 
 
 
 
 
201
  def load_manual_qa() -> None:
202
- """
203
- Load manual_qa.jsonl into qa_store.MANUAL_QA_LIST and MANUAL_QA_INDEX.
204
- """
205
  qa_store.MANUAL_QA_LIST.clear()
206
  qa_store.MANUAL_QA_INDEX.clear()
207
  max_num = 0
@@ -211,120 +199,60 @@ def load_manual_qa() -> None:
211
  qa_store.NEXT_MANUAL_ID = 1
212
  return
213
 
214
-
215
  with open(MANUAL_QA_PATH, "r", encoding="utf-8") as f:
216
  for line in f:
217
  line = line.strip()
218
- if not line:
219
- continue
220
  try:
221
  obj = json.loads(line)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
222
  except json.JSONDecodeError:
223
- print("[WARN] Skipping invalid JSON line in manual QA file.")
224
  continue
225
 
226
-
227
- q = (obj.get("q") or "").strip()
228
- a = (obj.get("a") or "").strip()
229
- if not q or not a:
230
- continue
231
-
232
- entry_id = str(obj.get("id") or "")
233
- if not entry_id:
234
- max_num += 1
235
- entry_id = f"manual_{max_num:04d}"
236
-
237
- # track biggest number in id
238
- import re as _re
239
-
240
- m = _re.search(r"(\d+)$", entry_id)
241
- if m:
242
- max_num = max(max_num, int(m.group(1)))
243
-
244
- norm_q = qa_store.normalize_question(q)
245
- entry = {
246
- "id": entry_id,
247
- "q": q,
248
- "a": a,
249
- "norm_q": norm_q,
250
- }
251
- qa_store.MANUAL_QA_LIST.append(entry)
252
- qa_store.MANUAL_QA_INDEX[norm_q] = entry
253
-
254
  qa_store.NEXT_MANUAL_ID = max_num + 1 if max_num > 0 else 1
255
 
256
-
257
- # loader.py
258
-
259
  def generate_new_manual_id() -> str:
260
- """
261
- Generate the smallest free manual_XXXX ID based on the
262
- current MANUAL_QA_LIST (so gaps like 11 after delete
263
- are reused).
264
- """
265
- import re as _re
266
-
267
  used_nums = set()
268
-
269
- # collect all numbers that are already used in IDs
270
  for e in qa_store.MANUAL_QA_LIST:
271
  raw_id = str(e.get("id") or "")
272
- m = _re.search(r"(\d+)$", raw_id)
273
- if m:
274
- used_nums.add(int(m.group(1)))
275
-
276
- # find the smallest positive integer that is not used
277
  i = 1
278
- while i in used_nums:
279
- i += 1
280
-
281
- # keep the global counter roughly in sync (optional)
282
- qa_store.NEXT_MANUAL_ID = i + 1
283
-
284
  return f"manual_{i:04d}"
285
 
286
-
287
-
288
  def save_manual_qa_file() -> None:
289
- """
290
- Persist MANUAL_QA_LIST to data/manual_qa.jsonl.
291
- """
292
  os.makedirs(os.path.dirname(MANUAL_QA_PATH), exist_ok=True)
293
  with open(MANUAL_QA_PATH, "w", encoding="utf-8") as f:
294
  for e in qa_store.MANUAL_QA_LIST:
295
  obj = {"id": e["id"], "q": e["q"], "a": e["a"]}
296
  f.write(json.dumps(obj, ensure_ascii=False) + "\n")
297
 
298
-
299
  def rebuild_combined_qa() -> None:
300
- """
301
- Combine auto and manual QA into QA_INDEX & ALL_QA_KNOWLEDGE.
302
- Manual answers override auto ones if same normalized question.
303
- """
304
  qa_store.QA_INDEX.clear()
305
  qa_store.ALL_QA_KNOWLEDGE.clear()
306
-
307
- # auto first
308
  for item in qa_store.AUTO_QA_KNOWLEDGE:
309
  norm_q = item["norm_q"]
310
  qa_store.QA_INDEX[norm_q] = item["a"]
311
  qa_store.ALL_QA_KNOWLEDGE.append(item)
312
-
313
- # manual overrides
314
  for e in qa_store.MANUAL_QA_LIST:
315
- item = {
316
- "norm_q": e["norm_q"],
317
- "q": e["q"],
318
- "a": e["a"],
319
- "source": "manual",
320
- "id": e["id"],
321
- }
322
  qa_store.QA_INDEX[item["norm_q"]] = item["a"]
323
  qa_store.ALL_QA_KNOWLEDGE.append(item)
324
 
325
-
326
  def manual_qa_table_data() -> List[List[str]]:
327
- """
328
- Table rows for Teacher Panel.
329
- """
330
- return [[e["id"], e["q"], e["a"]] for e in qa_store.MANUAL_QA_LIST]
 
2
  import os
3
  import json
4
  from typing import List, Dict, Any
 
 
 
 
 
5
  import qa_store
6
 
7
+ # ---------------------------------------------------------
8
+ # CONFIGURATION
9
+ # ---------------------------------------------------------
10
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
11
  DATA_DIR = os.path.join(BASE_DIR, "data")
12
 
13
+ # Keep Manual QA global so Teacher Panel can write to it easily
14
  MANUAL_QA_PATH = os.path.join(DATA_DIR, "manual_qa.jsonl")
15
 
16
+ # Cache file (Generated locally)
 
 
17
  CACHE_FILENAME = "cached_embeddings.pt"
18
  CACHE_PATH = os.path.join(DATA_DIR, CACHE_FILENAME)
19
 
20
+ DATASET_REPO_ID = "Heng2004/lao-science-qa-store"
21
+ DATASET_FILENAME = "manual_qa.jsonl"
22
+
23
+
24
+ # ---------------------------------------------------------
25
+ # CLOUD SYNC (Unchanged)
26
+ # ---------------------------------------------------------
27
  def sync_upload_cache() -> str:
28
  """Upload the cached_embeddings.pt to Hugging Face Dataset."""
29
  if not DATASET_REPO_ID or "YOUR_USERNAME" in DATASET_REPO_ID:
30
  return "⚠️ Upload Skipped (Repo ID not set)"
 
 
31
  try:
32
  from huggingface_hub import HfApi
33
  api = HfApi()
 
45
 
46
  def sync_download_cache() -> None:
47
  """Download cached_embeddings.pt at startup."""
48
+ if not DATASET_REPO_ID: return
 
 
 
49
  try:
50
  from huggingface_hub import hf_hub_download
51
  import shutil
 
52
  downloaded_path = hf_hub_download(
53
  repo_id=DATASET_REPO_ID,
54
  filename=CACHE_FILENAME,
 
61
  print(f"[WARN] Could not download cache (First run?): {e}")
62
 
63
  def sync_upload_manual_qa() -> str:
 
 
 
 
64
  if not DATASET_REPO_ID or "YOUR_USERNAME" in DATASET_REPO_ID:
65
+ return "⚠️ Upload Skipped"
 
 
66
  try:
67
  from huggingface_hub import HfApi
 
68
  api = HfApi()
 
69
  api.upload_file(
70
  path_or_fileobj=MANUAL_QA_PATH,
71
  path_in_repo=DATASET_FILENAME,
 
73
  repo_type="dataset",
74
  commit_message="Teacher Panel: Updated Q&A data"
75
  )
 
76
  return "☁️ Cloud Upload Success"
 
77
  except Exception as e:
 
78
  return f"⚠️ Cloud Upload Failed: {e}"
79
 
80
  def sync_download_manual_qa() -> None:
81
+ if not DATASET_REPO_ID: return
 
 
 
 
 
 
 
 
82
  try:
83
  from huggingface_hub import hf_hub_download
84
+ import shutil
 
85
  downloaded_path = hf_hub_download(
86
  repo_id=DATASET_REPO_ID,
87
  filename=DATASET_FILENAME,
88
  repo_type="dataset",
89
+ token=os.environ.get("HF_TOKEN")
90
  )
91
+ shutil.copy(downloaded_path, MANUAL_QA_PATH)
92
+ print("[INFO] Manual QA download success!")
 
 
 
 
 
93
  except Exception as e:
94
  print(f"[WARN] Could not download manual_qa.jsonl: {e}")
95
+
96
+
97
+ # ---------------------------------------------------------
98
+ # RECURSIVE LOADERS (The New Upgrade)
99
+ # ---------------------------------------------------------
100
 
101
  def load_curriculum() -> None:
102
  """
103
+ Recursively find and load all textbook JSONL files in data/
104
+ Looks for files named 'textbook.jsonl' OR starting with 'M'.
105
  """
106
  qa_store.ENTRIES.clear()
107
  qa_store.AUTO_QA_KNOWLEDGE.clear()
108
 
109
+ print(f"[INFO] Scanning {DATA_DIR} for textbook content...")
110
+
111
+ file_count = 0
112
+ # os.walk goes deep into M_1/U_1/...
113
+ for root, dirs, files in os.walk(DATA_DIR):
114
+ for file in files:
115
+ # Logic: Match specific filenames
116
+ is_textbook = file == "textbook.jsonl" or (file.startswith("M") and file.endswith(".jsonl"))
117
+
118
+ if is_textbook:
119
+ full_path = os.path.join(root, file)
120
+ _parse_curriculum_file(full_path)
121
+ file_count += 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
 
123
  if qa_store.ENTRIES:
124
  qa_store.RAW_KNOWLEDGE = "\n\n".join(e["text"] for e in qa_store.ENTRIES)
125
+ print(f"[INFO] Loaded {len(qa_store.ENTRIES)} entries from {file_count} files.")
126
  else:
127
+ qa_store.RAW_KNOWLEDGE = "ຍັງບໍ່ມີຂໍ້ມູນ."
128
+ print("[WARN] No curriculum files found.")
 
 
 
 
129
 
 
 
 
130
 
131
+ def _parse_curriculum_file(path: str):
132
+ """Helper to read a single textbook file"""
133
+ with open(path, "r", encoding="utf-8") as f:
134
  for line in f:
135
  line = line.strip()
136
+ if not line: continue
 
137
  try:
138
  obj = json.loads(line)
139
+ if "text" not in obj: continue
140
+
141
+ qa_store.ENTRIES.append(obj)
142
+
143
+ # Extract Auto-QA
144
+ for pair in obj.get("qa", []):
145
+ q = (pair.get("q") or "").strip()
146
+ a = (pair.get("a") or "").strip()
147
+ if q and a:
148
+ norm_q = qa_store.normalize_question(q)
149
+ qa_store.AUTO_QA_KNOWLEDGE.append({
150
+ "norm_q": norm_q,
151
+ "q": q,
152
+ "a": a,
153
+ "source": "auto",
154
+ "id": obj.get("id", "")
155
+ })
156
  except json.JSONDecodeError:
 
157
  continue
 
158
 
159
+
160
+ def load_glossary() -> None:
161
+ """
162
+ Recursively find and load all glossary JSONL files.
163
+ Looks for files named 'glossary.jsonl' OR starting with 'glossary'.
164
+ """
165
+ qa_store.GLOSSARY.clear()
166
+
167
+ print(f"[INFO] Scanning {DATA_DIR} for glossary files...")
168
+
169
+ for root, dirs, files in os.walk(DATA_DIR):
170
+ for file in files:
171
+ is_glossary = "glossary" in file and file.endswith(".jsonl")
172
+
173
+ if is_glossary:
174
+ full_path = os.path.join(root, file)
175
+ with open(full_path, "r", encoding="utf-8") as f:
176
+ for line in f:
177
+ line = line.strip()
178
+ if not line: continue
179
+ try:
180
+ obj = json.loads(line)
181
+ qa_store.GLOSSARY.append(obj)
182
+ except json.JSONDecodeError:
183
+ continue
184
+
185
  print(f"[INFO] Loaded {len(qa_store.GLOSSARY)} glossary terms.")
186
 
187
 
188
+ # ---------------------------------------------------------
189
+ # MANUAL QA & UTILS (Same as before)
190
+ # ---------------------------------------------------------
191
+
192
  def load_manual_qa() -> None:
 
 
 
193
  qa_store.MANUAL_QA_LIST.clear()
194
  qa_store.MANUAL_QA_INDEX.clear()
195
  max_num = 0
 
199
  qa_store.NEXT_MANUAL_ID = 1
200
  return
201
 
 
202
  with open(MANUAL_QA_PATH, "r", encoding="utf-8") as f:
203
  for line in f:
204
  line = line.strip()
205
+ if not line: continue
 
206
  try:
207
  obj = json.loads(line)
208
+ entry_id = str(obj.get("id") or "")
209
+
210
+ # ID tracking logic
211
+ import re
212
+ m = re.search(r"(\d+)$", entry_id)
213
+ if m: max_num = max(max_num, int(m.group(1)))
214
+
215
+ q = (obj.get("q") or "").strip()
216
+ a = (obj.get("a") or "").strip()
217
+ if q and a:
218
+ norm_q = qa_store.normalize_question(q)
219
+ entry = {"id": entry_id, "q": q, "a": a, "norm_q": norm_q}
220
+ qa_store.MANUAL_QA_LIST.append(entry)
221
+ qa_store.MANUAL_QA_INDEX[norm_q] = entry
222
  except json.JSONDecodeError:
 
223
  continue
224
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
  qa_store.NEXT_MANUAL_ID = max_num + 1 if max_num > 0 else 1
226
 
 
 
 
227
  def generate_new_manual_id() -> str:
228
+ import re
 
 
 
 
 
 
229
  used_nums = set()
 
 
230
  for e in qa_store.MANUAL_QA_LIST:
231
  raw_id = str(e.get("id") or "")
232
+ m = re.search(r"(\d+)$", raw_id)
233
+ if m: used_nums.add(int(m.group(1)))
 
 
 
234
  i = 1
235
+ while i in used_nums: i += 1
 
 
 
 
 
236
  return f"manual_{i:04d}"
237
 
 
 
238
  def save_manual_qa_file() -> None:
 
 
 
239
  os.makedirs(os.path.dirname(MANUAL_QA_PATH), exist_ok=True)
240
  with open(MANUAL_QA_PATH, "w", encoding="utf-8") as f:
241
  for e in qa_store.MANUAL_QA_LIST:
242
  obj = {"id": e["id"], "q": e["q"], "a": e["a"]}
243
  f.write(json.dumps(obj, ensure_ascii=False) + "\n")
244
 
 
245
  def rebuild_combined_qa() -> None:
 
 
 
 
246
  qa_store.QA_INDEX.clear()
247
  qa_store.ALL_QA_KNOWLEDGE.clear()
 
 
248
  for item in qa_store.AUTO_QA_KNOWLEDGE:
249
  norm_q = item["norm_q"]
250
  qa_store.QA_INDEX[norm_q] = item["a"]
251
  qa_store.ALL_QA_KNOWLEDGE.append(item)
 
 
252
  for e in qa_store.MANUAL_QA_LIST:
253
+ item = {"norm_q": e["norm_q"], "q": e["q"], "a": e["a"], "source": "manual", "id": e["id"]}
 
 
 
 
 
 
254
  qa_store.QA_INDEX[item["norm_q"]] = item["a"]
255
  qa_store.ALL_QA_KNOWLEDGE.append(item)
256
 
 
257
  def manual_qa_table_data() -> List[List[str]]:
258
+ return [[e["id"], e["q"], e["a"]] for e in qa_store.MANUAL_QA_LIST]