# build_cache.py
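"""Precompute sentence embeddings for the textbook entries and glossary terms
and cache them to disk, so the app can load them instead of re-encoding at
startup."""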
import os
import torch
from sentence_transformers import SentenceTransformer
import qa_store
from loader import load_curriculum, load_glossary

# 1. Configuration
EMBED_MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_DIR = os.path.join(BASE_DIR, "data")
CACHE_FILE = os.path.join(DATA_DIR, "cached_embeddings.pt")

def build_and_save():
    print("⏳ Loading data...")
    load_curriculum()
    load_glossary()

    print(f"⏳ Loading model: {EMBED_MODEL_NAME}...")
    # Prefer GPU if available; fall back to CPU so the build also runs without CUDA.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    embed_model = SentenceTransformer(EMBED_MODEL_NAME, device=device)

    # --- 2. Build Textbook Embeddings ---
    print(f"🧮 Computing embeddings for {len(qa_store.ENTRIES)} textbook entries...")
    textbook_texts = []
    for e in qa_store.ENTRIES:
        chapter = e.get("chapter_title", "") or e.get("chapter", "") or ""
        section = e.get("section_title", "") or e.get("section", "") or ""
        text = e.get("text", "") or ""
        combined = f"{chapter}\n{section}\n{text}"
        textbook_texts.append(combined)

    if textbook_texts:
        textbook_embeddings = embed_model.encode(
            textbook_texts,
            convert_to_tensor=True,
            show_progress_bar=True
        )
    else:
        textbook_embeddings = None

    # --- 3. Build Glossary Embeddings ---
    print(f"🧮 Computing embeddings for {len(qa_store.GLOSSARY)} glossary terms...")
    glossary_texts = [
        f"{item.get('term', '')} :: {item.get('definition', '')}"
        for item in qa_store.GLOSSARY
    ]
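    # NOTE: unlike the textbook tensors above, these are saved as L2-normalised
    # numpy arrays; whatever consumes the cache presumably expects both formats.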
    
    if glossary_texts:
        glossary_embeddings = embed_model.encode(
            glossary_texts,
            convert_to_numpy=True,
            normalize_embeddings=True,
            show_progress_bar=True
        )
    else:
        glossary_embeddings = None

    # --- 4. Save to Disk ---
    os.makedirs(DATA_DIR, exist_ok=True)  # ensure data/ exists before writing
    print(f"💾 Saving to {CACHE_FILE}...")
    torch.save({
        # Move the tensor to CPU so the cache also loads on CPU-only hosts.
        "textbook": textbook_embeddings.cpu() if textbook_embeddings is not None else None,
        "glossary": glossary_embeddings
    }, CACHE_FILE)
    
    print("✅ Done! You can now upload 'data/cached_embeddings.pt' to Hugging Face.")

if __name__ == "__main__":
    build_and_save()
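
# --- Usage sketch (illustrative only) ---------------------------------------
# A minimal sketch of how the app side might restore the cache; the variable
# names are assumptions, only the dict keys come from this script:
#
#   cache = torch.load(CACHE_FILE, map_location="cpu")
#   textbook_emb = cache["textbook"]   # torch.Tensor, or None if no entries
#   glossary_emb = cache["glossary"]   # normalised numpy.ndarray, or None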