Heng2004 committed on
Commit
2d6429a
·
verified ·
1 Parent(s): ab33eee

Create build_cache.py

Browse files
Files changed (1) hide show
  1. build_cache.py +70 -0
build_cache.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # build_cache.py
2
+ import os
3
+ import torch
4
+ from sentence_transformers import SentenceTransformer
5
+ import qa_store
6
+ from loader import load_curriculum, load_glossary
7
+
8
+ # 1. Configuration
9
+ EMBED_MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
10
+ BASE_DIR = os.path.dirname(os.path.abspath(__file__))
11
+ DATA_DIR = os.path.join(BASE_DIR, "data")
12
+ CACHE_FILE = os.path.join(DATA_DIR, "cached_embeddings.pt")
13
+
14
+ def build_and_save():
15
+ print("⏳ Loading data...")
16
+ load_curriculum()
17
+ load_glossary()
18
+
19
+ print(f"⏳ Loading model: {EMBED_MODEL_NAME}...")
20
+ # Use CPU for build script to ensure compatibility, or cuda if you have it
21
+ device = "cuda" if torch.cuda.is_available() else "cpu"
22
+ embed_model = SentenceTransformer(EMBED_MODEL_NAME, device=device)
23
+
24
+ # --- 2. Build Textbook Embeddings ---
25
+ print(f"🧮 Computing embeddings for {len(qa_store.ENTRIES)} textbook entries...")
26
+ textbook_texts = []
27
+ for e in qa_store.ENTRIES:
28
+ chapter = e.get("chapter_title", "") or e.get("chapter", "") or ""
29
+ section = e.get("section_title", "") or e.get("section", "") or ""
30
+ text = e.get("text", "") or ""
31
+ combined = f"{chapter}\n{section}\n{text}"
32
+ textbook_texts.append(combined)
33
+
34
+ if textbook_texts:
35
+ textbook_embeddings = embed_model.encode(
36
+ textbook_texts,
37
+ convert_to_tensor=True,
38
+ show_progress_bar=True
39
+ )
40
+ else:
41
+ textbook_embeddings = None
42
+
43
+ # --- 3. Build Glossary Embeddings ---
44
+ print(f"🧮 Computing embeddings for {len(qa_store.GLOSSARY)} glossary terms...")
45
+ glossary_texts = [
46
+ f"{item.get('term', '')} :: {item.get('definition', '')}"
47
+ for item in qa_store.GLOSSARY
48
+ ]
49
+
50
+ if glossary_texts:
51
+ glossary_embeddings = embed_model.encode(
52
+ glossary_texts,
53
+ convert_to_numpy=True,
54
+ normalize_embeddings=True,
55
+ show_progress_bar=True
56
+ )
57
+ else:
58
+ glossary_embeddings = None
59
+
60
+ # --- 4. Save to Disk ---
61
+ print(f"💾 Saving to {CACHE_FILE}...")
62
+ torch.save({
63
+ "textbook": textbook_embeddings,
64
+ "glossary": glossary_embeddings
65
+ }, CACHE_FILE)
66
+
67
+ print("✅ Done! You can now upload 'data/cached_embeddings.pt' to Hugging Face.")
68
+
69
+ if __name__ == "__main__":
70
+ build_and_save()