| | |
| | import os |
| | import torch |
| | from sentence_transformers import SentenceTransformer |
| | import qa_store |
| | from loader import load_curriculum, load_glossary |
| |
|
| | |
# Hugging Face model id passed to SentenceTransformer for all embeddings.
EMBED_MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

# Paths are resolved relative to this file, not the current working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_DIR = os.path.join(BASE_DIR, "data")
# Destination of the serialized embedding cache (inside DATA_DIR).
CACHE_FILE = os.path.join(BASE_DIR, "data", "cached_embeddings.pt")
| |
|
def build_and_save():
    """Precompute textbook and glossary embeddings and write them to CACHE_FILE.

    Calls ``load_curriculum()`` and ``load_glossary()``, then encodes the items
    found in ``qa_store.ENTRIES`` and ``qa_store.GLOSSARY`` (presumably
    populated by those loaders -- confirm against ``loader``) with the
    ``EMBED_MODEL_NAME`` model and saves a dict via ``torch.save``:

    * ``"textbook"``: result of ``encode(..., convert_to_tensor=True)``, moved
      to CPU; ``None`` when there are no entries.
    * ``"glossary"``: result of ``encode(..., convert_to_numpy=True,
      normalize_embeddings=True)``; ``None`` when the glossary is empty.

    NOTE(review): textbook embeddings are not normalized while glossary
    embeddings are, and the two use different container types. This is kept
    as-is because consumers of the cache may rely on it -- confirm.

    Returns:
        None. Progress is reported with ``print``.
    """
    print("⏳ Loading data...")
    load_curriculum()
    load_glossary()

    print(f"⏳ Loading model: {EMBED_MODEL_NAME}...")
    # Encode on the GPU when one is available; the results are moved back to
    # the CPU before saving so the cache does not depend on this machine.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    embed_model = SentenceTransformer(EMBED_MODEL_NAME, device=device)

    print(f"🧮 Computing embeddings for {len(qa_store.ENTRIES)} textbook entries...")
    textbook_texts = []
    for e in qa_store.ENTRIES:
        # Fall back from the "*_title" key to the short key, then to "".
        chapter = e.get("chapter_title", "") or e.get("chapter", "") or ""
        section = e.get("section_title", "") or e.get("section", "") or ""
        text = e.get("text", "") or ""
        textbook_texts.append(f"{chapter}\n{section}\n{text}")

    textbook_embeddings = None
    if textbook_texts:
        textbook_embeddings = embed_model.encode(
            textbook_texts,
            convert_to_tensor=True,
            show_progress_bar=True,
        )
        # torch.save records the tensor's device; a CUDA tensor cannot be
        # restored by a plain torch.load on a CPU-only host.
        textbook_embeddings = textbook_embeddings.cpu()

    print(f"🧮 Computing embeddings for {len(qa_store.GLOSSARY)} glossary terms...")
    glossary_texts = [
        f"{item.get('term', '')} :: {item.get('definition', '')}"
        for item in qa_store.GLOSSARY
    ]

    glossary_embeddings = None
    if glossary_texts:
        glossary_embeddings = embed_model.encode(
            glossary_texts,
            convert_to_numpy=True,
            normalize_embeddings=True,
            show_progress_bar=True,
        )

    print(f"💾 Saving to {CACHE_FILE}...")
    # Make sure the target directory exists so torch.save cannot fail on a
    # missing folder after the expensive encoding step has already run.
    os.makedirs(os.path.dirname(CACHE_FILE), exist_ok=True)
    torch.save(
        {
            "textbook": textbook_embeddings,
            "glossary": glossary_embeddings,
        },
        CACHE_FILE,
    )

    print("✅ Done! You can now upload 'data/cached_embeddings.pt' to Hugging Face.")
| |
|
if __name__ == "__main__":
    # Script entry point: rebuild the embedding cache when run directly.
    build_and_save()