EphAsad committed (verified)
Commit f2213be · 1 Parent(s): 9fc007c

Upload 21 files

models/genus_xgb.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6530346e18bdb61e778f887fef7ca33e8e0b56e040ce95287c373073db726cfb
+ size 34613851
models/genus_xgb_meta.json ADDED
@@ -0,0 +1,369 @@
1
+ {
2
+ "genus_to_idx": {
3
+ "Staphylococcus": 0,
4
+ "Salmonella": 1,
5
+ "Listeria": 2,
6
+ "Enterobacter": 3,
7
+ "Pseudomonas": 4,
8
+ "Streptococcus": 5,
9
+ "Enterococcus": 6,
10
+ "Bacillus": 7,
11
+ "Shigella": 8,
12
+ "Escherichia": 9,
13
+ "Klebsiella": 10,
14
+ "Proteus": 11,
15
+ "Vibrio": 12,
16
+ "Neisseria": 13,
17
+ "Campylobacter": 14,
18
+ "Clostridium": 15,
19
+ "Corynebacterium": 16,
20
+ "Legionella": 17,
21
+ "Mycobacterium": 18,
22
+ "Bacteroides": 19,
23
+ "Micrococcus": 20,
24
+ "Erysipelothrix": 21,
25
+ "Haemophilus": 22,
26
+ "Aeromonas": 23,
27
+ "Yersinia": 24,
28
+ "Acinetobacter": 25,
29
+ "Serratia": 26,
30
+ "Morganella": 27,
31
+ "Providencia": 28,
32
+ "Burkholderia": 29,
33
+ "Helicobacter": 30,
34
+ "Actinomyces": 31,
35
+ "Nocardia": 32,
36
+ "Pasteurella": 33,
37
+ "Citrobacter": 34,
38
+ "Leptospira": 35,
39
+ "Alcaligenes": 36,
40
+ "Shewanella": 37,
41
+ "Edwardsiella": 38,
42
+ "Chromobacterium": 39,
43
+ "Lactobacillus": 40,
44
+ "Propionibacterium": 41,
45
+ "Peptostreptococcus": 42,
46
+ "Veillonella": 43,
47
+ "Fusobacterium": 44,
48
+ "Eubacterium": 45,
49
+ "Halomonas": 46,
50
+ "Psychrobacter": 47,
51
+ "Rhodococcus": 48,
52
+ "Mycoplasma": 49,
53
+ "Bordetella": 50,
54
+ "Stenotrophomonas": 51,
55
+ "Ralstonia": 52,
56
+ "Achromobacter": 53,
57
+ "Brucella": 54,
58
+ "Arthrobacter": 55,
59
+ "Flavobacterium": 56,
60
+ "Oerskovia": 57,
61
+ "Sphingomonas": 58,
62
+ "Comamonas": 59,
63
+ "Thermococcus": 60,
64
+ "Elizabethkingia": 61,
65
+ "Hafnia": 62,
66
+ "Raoultella": 63,
67
+ "Ochrobactrum": 64,
68
+ "Roseomonas": 65,
69
+ "Actinobacillus": 66,
70
+ "Gemella": 67,
71
+ "Rothia": 68,
72
+ "Carnobacterium": 69,
73
+ "Plesiomonas": 70,
74
+ "Janthinobacterium": 71,
75
+ "Paenibacillus": 72,
76
+ "Moraxella": 73,
77
+ "Aerococcus": 74,
78
+ "Kocuria": 75,
79
+ "Leuconostoc": 76,
80
+ "Arcanobacterium": 77,
81
+ "Gardnerella": 78,
82
+ "Porphyromonas": 79,
83
+ "Prevotella": 80,
84
+ "Pediococcus": 81,
85
+ "Weissella": 82,
86
+ "Lactococcus": 83,
87
+ "Microbacterium": 84,
88
+ "Clostridioides": 85,
89
+ "Cronobacter": 86,
90
+ "Rhizobium": 87,
91
+ "Azotobacter": 88,
92
+ "Spirillum": 89,
93
+ "Candida": 90,
94
+ "Cryptococcus": 91,
95
+ "Saccharomyces": 92,
96
+ "Rickettsia": 93,
97
+ "Borrelia": 94,
98
+ "Chlamydia": 95,
99
+ "Acidaminococcus": 96,
100
+ "Bartonella": 97,
101
+ "Coxiella": 98,
102
+ "Kingella": 99,
103
+ "Eikenella": 100,
104
+ "Bilophila": 101,
105
+ "Anaerococcus": 102,
106
+ "Finegoldia": 103,
107
+ "Parvimonas": 104,
108
+ "Ruminococcus": 105,
109
+ "Cutibacterium": 106,
110
+ "Exiguobacterium": 107,
111
+ "Kluyvera": 108,
112
+ "Pluralibacter": 109,
113
+ "Massilia": 110,
114
+ "Methylobacterium": 111,
115
+ "Cupriavidus": 112,
116
+ "Acidovorax": 113,
117
+ "Geobacillus": 114,
118
+ "Trueperella": 115,
119
+ "Streptomyces": 116,
120
+ "Thermoactinomyces": 117,
121
+ "Capnocytophaga": 118,
122
+ "Cardiobacterium": 119,
123
+ "Yokenella": 120,
124
+ "Brevibacterium": 121,
125
+ "Peptoniphilus": 122,
126
+ "Weisella": 123,
127
+ "Saccharopolyspora": 124,
128
+ "Frankia": 125,
129
+ "Spiroplasma": 126,
130
+ "Cedecea": 127,
131
+ "Photorhabdus": 128,
132
+ "Abiotrophia": 129,
133
+ "Cellulomonas": 130,
134
+ "Leifsonia": 131,
135
+ "Alicyclobacillus": 132,
136
+ "Sporolactobacillus": 133,
137
+ "Leclercia": 134,
138
+ "Kosakonia": 135,
139
+ "Bergeyella": 136,
140
+ "Myroides": 137,
141
+ "Aggregatibacter": 138,
142
+ ":": 139
143
+ },
144
+ "idx_to_genus": {
145
+ "0": "Staphylococcus",
146
+ "1": "Salmonella",
147
+ "2": "Listeria",
148
+ "3": "Enterobacter",
149
+ "4": "Pseudomonas",
150
+ "5": "Streptococcus",
151
+ "6": "Enterococcus",
152
+ "7": "Bacillus",
153
+ "8": "Shigella",
154
+ "9": "Escherichia",
155
+ "10": "Klebsiella",
156
+ "11": "Proteus",
157
+ "12": "Vibrio",
158
+ "13": "Neisseria",
159
+ "14": "Campylobacter",
160
+ "15": "Clostridium",
161
+ "16": "Corynebacterium",
162
+ "17": "Legionella",
163
+ "18": "Mycobacterium",
164
+ "19": "Bacteroides",
165
+ "20": "Micrococcus",
166
+ "21": "Erysipelothrix",
167
+ "22": "Haemophilus",
168
+ "23": "Aeromonas",
169
+ "24": "Yersinia",
170
+ "25": "Acinetobacter",
171
+ "26": "Serratia",
172
+ "27": "Morganella",
173
+ "28": "Providencia",
174
+ "29": "Burkholderia",
175
+ "30": "Helicobacter",
176
+ "31": "Actinomyces",
177
+ "32": "Nocardia",
178
+ "33": "Pasteurella",
179
+ "34": "Citrobacter",
180
+ "35": "Leptospira",
181
+ "36": "Alcaligenes",
182
+ "37": "Shewanella",
183
+ "38": "Edwardsiella",
184
+ "39": "Chromobacterium",
185
+ "40": "Lactobacillus",
186
+ "41": "Propionibacterium",
187
+ "42": "Peptostreptococcus",
188
+ "43": "Veillonella",
189
+ "44": "Fusobacterium",
190
+ "45": "Eubacterium",
191
+ "46": "Halomonas",
192
+ "47": "Psychrobacter",
193
+ "48": "Rhodococcus",
194
+ "49": "Mycoplasma",
195
+ "50": "Bordetella",
196
+ "51": "Stenotrophomonas",
197
+ "52": "Ralstonia",
198
+ "53": "Achromobacter",
199
+ "54": "Brucella",
200
+ "55": "Arthrobacter",
201
+ "56": "Flavobacterium",
202
+ "57": "Oerskovia",
203
+ "58": "Sphingomonas",
204
+ "59": "Comamonas",
205
+ "60": "Thermococcus",
206
+ "61": "Elizabethkingia",
207
+ "62": "Hafnia",
208
+ "63": "Raoultella",
209
+ "64": "Ochrobactrum",
210
+ "65": "Roseomonas",
211
+ "66": "Actinobacillus",
212
+ "67": "Gemella",
213
+ "68": "Rothia",
214
+ "69": "Carnobacterium",
215
+ "70": "Plesiomonas",
216
+ "71": "Janthinobacterium",
217
+ "72": "Paenibacillus",
218
+ "73": "Moraxella",
219
+ "74": "Aerococcus",
220
+ "75": "Kocuria",
221
+ "76": "Leuconostoc",
222
+ "77": "Arcanobacterium",
223
+ "78": "Gardnerella",
224
+ "79": "Porphyromonas",
225
+ "80": "Prevotella",
226
+ "81": "Pediococcus",
227
+ "82": "Weissella",
228
+ "83": "Lactococcus",
229
+ "84": "Microbacterium",
230
+ "85": "Clostridioides",
231
+ "86": "Cronobacter",
232
+ "87": "Rhizobium",
233
+ "88": "Azotobacter",
234
+ "89": "Spirillum",
235
+ "90": "Candida",
236
+ "91": "Cryptococcus",
237
+ "92": "Saccharomyces",
238
+ "93": "Rickettsia",
239
+ "94": "Borrelia",
240
+ "95": "Chlamydia",
241
+ "96": "Acidaminococcus",
242
+ "97": "Bartonella",
243
+ "98": "Coxiella",
244
+ "99": "Kingella",
245
+ "100": "Eikenella",
246
+ "101": "Bilophila",
247
+ "102": "Anaerococcus",
248
+ "103": "Finegoldia",
249
+ "104": "Parvimonas",
250
+ "105": "Ruminococcus",
251
+ "106": "Cutibacterium",
252
+ "107": "Exiguobacterium",
253
+ "108": "Kluyvera",
254
+ "109": "Pluralibacter",
255
+ "110": "Massilia",
256
+ "111": "Methylobacterium",
257
+ "112": "Cupriavidus",
258
+ "113": "Acidovorax",
259
+ "114": "Geobacillus",
260
+ "115": "Trueperella",
261
+ "116": "Streptomyces",
262
+ "117": "Thermoactinomyces",
263
+ "118": "Capnocytophaga",
264
+ "119": "Cardiobacterium",
265
+ "120": "Yokenella",
266
+ "121": "Brevibacterium",
267
+ "122": "Peptoniphilus",
268
+ "123": "Weisella",
269
+ "124": "Saccharopolyspora",
270
+ "125": "Frankia",
271
+ "126": "Spiroplasma",
272
+ "127": "Cedecea",
273
+ "128": "Photorhabdus",
274
+ "129": "Abiotrophia",
275
+ "130": "Cellulomonas",
276
+ "131": "Leifsonia",
277
+ "132": "Alicyclobacillus",
278
+ "133": "Sporolactobacillus",
279
+ "134": "Leclercia",
280
+ "135": "Kosakonia",
281
+ "136": "Bergeyella",
282
+ "137": "Myroides",
283
+ "138": "Aggregatibacter",
284
+ "139": ":"
285
+ },
286
+ "n_features": 73,
287
+ "num_classes": 140,
288
+ "metrics": {
289
+ "train_accuracy": 0.9869916267942583,
290
+ "valid_accuracy": 0.9509569377990431,
291
+ "best_iteration": 270
292
+ },
293
+ "feature_schema_path": "data/feature_schema.json",
294
+ "feature_names": [
295
+ "Gram Stain",
296
+ "Shape",
297
+ "Haemolysis",
298
+ "Haemolysis Type",
299
+ "Catalase",
300
+ "Oxidase",
301
+ "Indole",
302
+ "Urease",
303
+ "Citrate",
304
+ "H2S",
305
+ "DNase",
306
+ "Lysine Decarboxylase",
307
+ "Ornithine Decarboxylase",
308
+ "Arginine dihydrolase",
309
+ "ONPG",
310
+ "Nitrate Reduction",
311
+ "Methyl Red",
312
+ "VP",
313
+ "Coagulase",
314
+ "Lipase Test",
315
+ "Motility",
316
+ "Motility Type",
317
+ "Capsule",
318
+ "Spore Formation",
319
+ "Pigment",
320
+ "Odor",
321
+ "Colony Pattern",
322
+ "TSI Pattern",
323
+ "Temperature_4C",
324
+ "Temperature_25C",
325
+ "Temperature_30C",
326
+ "Temperature_37C",
327
+ "Temperature_42C",
328
+ "Lactose Fermentation",
329
+ "Glucose Fermentation",
330
+ "Sucrose Fermentation",
331
+ "Mannitol Fermentation",
332
+ "Maltose Fermentation",
333
+ "Sorbitol Fermentation",
334
+ "Xylose Fermentation",
335
+ "Rhamnose Fermentation",
336
+ "Arabinose Fermentation",
337
+ "Raffinose Fermentation",
338
+ "Trehalose Fermentation",
339
+ "Inositol Fermentation",
340
+ "Oxygen Requirement",
341
+ "Gas Production",
342
+ "MacConkey Growth",
343
+ "Blood Growth",
344
+ "XLD Growth",
345
+ "Nutrient Growth",
346
+ "Cetrimide Growth",
347
+ "BCYE Growth",
348
+ "Hektoen Enteric Growth",
349
+ "Mannitol Salt Growth",
350
+ "Bordet-Gengou Growth",
351
+ "Thayer Martin Growth",
352
+ "Cycloserine Cefoxitin Fructose Growth",
353
+ "Sabouraud Growth",
354
+ "Lowenstein-Jensen Growth",
355
+ "Yeast Extract Mannitol Growth",
356
+ "BSK Growth",
357
+ "Brucella Growth",
358
+ "Charcoal Growth",
359
+ "BHI Growth",
360
+ "Ashby Growth",
361
+ "MRS Growth",
362
+ "Anaerobic Blood Growth",
363
+ "BP Growth",
364
+ "ALOA Growth",
365
+ "Anaerobic Growth",
366
+ "Chocolate Growth",
367
+ "TCBS Growth"
368
+ ]
369
+ }
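
The meta file above pairs the LFS-tracked booster in models/genus_xgb.json with its label mapping, feature order, and validation metrics. Below is a minimal, hedged sketch of how the two files might be used together at inference time; the xgboost Booster calls and the softprob-style prediction shape are assumptions, since the training code is not part of this upload.

# Hedged sketch (not part of this commit): load the booster with its meta mapping.
# Assumes the model was trained with a softprob-style objective, so predict()
# returns one probability per class.
import json
import numpy as np
import xgboost as xgb

with open("models/genus_xgb_meta.json", "r", encoding="utf-8") as f:
    meta = json.load(f)

booster = xgb.Booster()
booster.load_model("models/genus_xgb.json")

def predict_genus(feature_row):
    """feature_row must follow meta['feature_names'] order (n_features = 73)."""
    dmat = xgb.DMatrix(np.asarray([feature_row], dtype="float32"))
    probs = booster.predict(dmat)[0]            # length == meta["num_classes"]
    return meta["idx_to_genus"][str(int(np.argmax(probs)))]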
rag/context_shaper.py ADDED
@@ -0,0 +1,168 @@
1
+ # rag/context_shaper.py
2
+ # ============================================================
3
+ # Context shaper for RAG
4
+ #
5
+ # Goal:
6
+ # - Convert "flattened schema dumps" (Field: Value lines) into
7
+ # readable evidence blocks the LLM can reason over.
8
+ # - Deterministic, no LLM usage.
9
+ #
10
+ # Works with:
11
+ # - llm_context from rag_retriever (biology-only text)
12
+ # ============================================================
13
+
14
+ from __future__ import annotations
15
+
16
+ import re
17
+ from typing import Dict, List, Tuple, Optional
18
+
19
+
20
+ _FIELD_LINE_RE = re.compile(r"^\s*([^:\n]{1,80})\s*:\s*(.+?)\s*$")
21
+
22
+ # Some fields are usually lists separated by ; or , or |
23
+ _LIST_LIKE_FIELDS = {
24
+ "Media Grown On",
25
+ "Colony Morphology",
26
+ "Colony Pattern",
27
+ "Growth Temperature",
28
+ }
29
+
30
+ # A light grouping map to turn fields into readable sections.
31
+ # (You can expand this over time.)
32
+ _GROUPS: List[Tuple[str, List[str]]] = [
33
+ ("Morphology & staining", [
34
+ "Gram Stain", "Shape", "Cellular Arrangement", "Capsule", "Spore Forming",
35
+ ]),
36
+ ("Culture & colony", [
37
+ "Media Grown On", "Colony Morphology", "Colony Pattern", "Pigment", "Odour",
38
+ "Haemolysis", "Haemolysis Type",
39
+ ]),
40
+ ("Core biochemistry", [
41
+ "Oxidase", "Catalase", "Indole", "Urease", "Citrate", "Methyl Red", "VP",
42
+ "Nitrate Reduction", "ONPG", "TSI Pattern", "H2S", "Gas Production",
43
+ "Glucose Fermentation", "Lactose Fermentation", "Sucrose Fermentation",
44
+ "Inositol Fermentation", "Mannitol Fermentation",
45
+ ]),
46
+ ("Motility & growth conditions", [
47
+ "Motility", "Motility Type", "Growth Temperature", "NaCl", "NaCl Tolerance",
48
+ "Oxygen Requirement",
49
+ ]),
50
+ ("Other tests", [
51
+ "DNase", "Esculin Hydrolysis", "Gelatin Hydrolysis",
52
+ "Lysine Decarboxylase", "Ornithine Decarboxylase", "Arginine Dihydrolase",
53
+ ]),
54
+ ]
55
+
56
+
57
+ def _is_schema_dump(text: str) -> bool:
58
+ """
59
+ Detect if rag context looks like flattened Field: Value lines.
60
+ """
61
+ if not text:
62
+ return False
63
+ lines = [l for l in text.splitlines() if l.strip()]
64
+ if len(lines) < 6:
65
+ return False
66
+ hits = 0
67
+ for l in lines[:40]:
68
+ if _FIELD_LINE_RE.match(l):
69
+ hits += 1
70
+ return hits >= max(4, int(0.5 * min(len(lines), 40)))
71
+
72
+
73
+ def _split_listish(field: str, value: str) -> str:
74
+ """
75
+ Normalize list-like values into comma-separated readable text.
76
+ """
77
+ v = (value or "").strip()
78
+ if not v:
79
+ return v
80
+ if field in _LIST_LIKE_FIELDS or (";" in v) or ("," in v):
81
+ parts = [p.strip() for p in re.split(r"[;,\|]+", v) if p.strip()]
82
+ if parts:
83
+ return ", ".join(parts)
84
+ return v
85
+
86
+
87
+ def _parse_field_lines(text: str) -> Dict[str, str]:
88
+ """
89
+ Parse Field: Value lines into a dict. Keeps last occurrence.
90
+ """
91
+ out: Dict[str, str] = {}
92
+ for raw in (text or "").splitlines():
93
+ line = raw.strip()
94
+ if not line:
95
+ continue
96
+ m = _FIELD_LINE_RE.match(line)
97
+ if not m:
98
+ continue
99
+ field = m.group(1).strip()
100
+ value = m.group(2).strip()
101
+ if not field:
102
+ continue
103
+ out[field] = _split_listish(field, value)
104
+ return out
105
+
106
+
107
+ def _format_grouped_blocks(fields: Dict[str, str]) -> str:
108
+ """
109
+ Turn fields into grouped, readable evidence blocks.
110
+ """
111
+ used = set()
112
+ blocks: List[str] = []
113
+
114
+ for title, keys in _GROUPS:
115
+ lines: List[str] = []
116
+ for k in keys:
117
+ if k in fields:
118
+ val = fields[k]
119
+ if val and val.lower() != "unknown":
120
+ lines.append(f"- {k}: {val}")
121
+ used.add(k)
122
+ if lines:
123
+ blocks.append(f"{title}:\n" + "\n".join(lines))
124
+
125
+ # Any leftovers not in group map
126
+ leftovers: List[str] = []
127
+ for k, v in fields.items():
128
+ if k in used:
129
+ continue
130
+ if not v or v.lower() == "unknown":
131
+ continue
132
+ leftovers.append(f"- {k}: {v}")
133
+ if leftovers:
134
+ blocks.append("Additional traits:\n" + "\n".join(leftovers))
135
+
136
+ return "\n\n".join(blocks).strip()
137
+
138
+
139
+ def shape_llm_context(
140
+ llm_context: str,
141
+ target_genus: str = "",
142
+ max_chars: int = 1800,
143
+ ) -> str:
144
+ """
145
+ Main entrypoint.
146
+ - If context is already narrative, keep it (trim to max_chars).
147
+ - If it is a schema dump, convert to grouped evidence blocks.
148
+ """
149
+ ctx = (llm_context or "").strip()
150
+ if not ctx:
151
+ return ""
152
+
153
+ if _is_schema_dump(ctx):
154
+ fields = _parse_field_lines(ctx)
155
+ shaped = _format_grouped_blocks(fields)
156
+
157
+ # Add a tiny header to cue the LLM that this is reference evidence
158
+ if target_genus:
159
+ shaped = f"Reference evidence for {target_genus} (compiled traits):\n\n{shaped}"
160
+ else:
161
+ shaped = f"Reference evidence (compiled traits):\n\n{shaped}"
162
+
163
+ return shaped[:max_chars].strip()
164
+
165
+ # Narrative context: just trim
166
+ if target_genus:
167
+ ctx = f"Reference context for {target_genus}:\n\n{ctx}"
168
+ return ctx[:max_chars].strip()
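
A short usage sketch for shape_llm_context follows; the trait lines are illustrative values, not data from this repository.

# Hedged usage sketch (illustrative trait values only):
from rag.context_shaper import shape_llm_context

raw = "\n".join([
    "Gram Stain: Positive",
    "Shape: Cocci",
    "Catalase: Positive",
    "Coagulase: Positive",
    "Haemolysis: Positive",
    "Media Grown On: Blood agar; Mannitol salt agar",
])
print(shape_llm_context(raw, target_genus="Staphylococcus"))
# The input is detected as a Field: Value dump, so the output starts with
# "Reference evidence for Staphylococcus (compiled traits):" followed by
# grouped bullets (e.g. "Morphology & staining:", "Culture & colony:").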
rag/rag_embedder.py ADDED
@@ -0,0 +1,112 @@
1
+ # rag/rag_embedder.py
2
+ # ============================================================
3
+ # Embedding utilities for RAG (knowledge base + queries)
4
+ # Uses a SentenceTransformer model for dense embeddings.
5
+ # ============================================================
6
+
7
+ from __future__ import annotations
8
+
9
+ import os
10
+ import json
11
+ from typing import List, Dict, Any
12
+
13
+ import numpy as np
14
+ from sentence_transformers import SentenceTransformer
15
+
16
+
17
+ # ------------------------------------------------------------
18
+ # CONFIG
19
+ # ------------------------------------------------------------
20
+
21
+ EMBEDDING_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
22
+
23
+ _model: SentenceTransformer | None = None
24
+
25
+
26
+ # ------------------------------------------------------------
27
+ # MODEL LOADING
28
+ # ------------------------------------------------------------
29
+
30
+ def get_embedder() -> SentenceTransformer:
31
+ global _model
32
+ if _model is None:
33
+ _model = SentenceTransformer(EMBEDDING_MODEL_NAME)
34
+ return _model
35
+
36
+
37
+ # ------------------------------------------------------------
38
+ # EMBEDDING
39
+ # ------------------------------------------------------------
40
+
41
+ def embed_text(text: str, normalize: bool = True) -> np.ndarray:
42
+ """
43
+ Embed a single piece of text.
44
+ Returns a 1D numpy array (MPNet: 768-dim).
45
+ """
46
+ model = get_embedder()
47
+ emb = model.encode(
48
+ [text],
49
+ show_progress_bar=False,
50
+ normalize_embeddings=normalize,
51
+ )
52
+ return emb[0]
53
+
54
+
55
+ def embed_texts(texts: List[str], normalize: bool = True) -> np.ndarray:
56
+ """
57
+ Embed a list of strings -> (N, D) numpy array.
58
+ """
59
+ model = get_embedder()
60
+ return model.encode(
61
+ texts,
62
+ show_progress_bar=False,
63
+ normalize_embeddings=normalize,
64
+ )
65
+
66
+
67
+ # ------------------------------------------------------------
68
+ # INDEX LOADING
69
+ # ------------------------------------------------------------
70
+
71
+ def load_kb_index(path: str = "data/rag/index/kb_index.json") -> Dict[str, Any]:
72
+ """
73
+ Load the RAG knowledge base index JSON.
74
+
75
+ Expected format:
76
+ {
77
+ "version": int,
78
+ "model_name": str,
79
+ "records": [
80
+ {
81
+ "id": str,
82
+ "genus": str,
83
+ "species": str | null,
84
+ "level": "genus" | "species",
85
+ "chunk_id": int,
86
+ "source_file": str,
87
+ "text": str,
88
+ "embedding": [float, ...]
89
+ }
90
+ ]
91
+ }
92
+ """
93
+ if not os.path.exists(path):
94
+ raise FileNotFoundError(f"KB index not found at {path}")
95
+
96
+ with open(path, "r", encoding="utf-8") as f:
97
+ data = json.load(f)
98
+
99
+ index_model = data.get("model_name")
100
+ if index_model != EMBEDDING_MODEL_NAME:
101
+ raise ValueError(
102
+ f"KB index built with '{index_model}', "
103
+ f"but current embedder is '{EMBEDDING_MODEL_NAME}'. "
104
+ "Rebuild the index."
105
+ )
106
+
107
+ # Convert embeddings to numpy arrays
108
+ for rec in data.get("records", []):
109
+ if isinstance(rec.get("embedding"), list):
110
+ rec["embedding"] = np.array(rec["embedding"], dtype="float32")
111
+
112
+ return data
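
A small sketch of querying the loaded index with this embedder follows; it assumes the default kb_index.json exists and was built with the same MPNet model, and the query text is illustrative.

# Hedged sketch (assumes data/rag/index/kb_index.json exists and was built
# with the same all-mpnet-base-v2 embedder):
import numpy as np
from rag.rag_embedder import embed_text, load_kb_index

kb = load_kb_index()
query_vec = embed_text("Gram negative rod, oxidase positive, green pigment")

# Embeddings are stored L2-normalized, so a dot product is cosine similarity.
ranked = sorted(
    kb["records"],
    key=lambda rec: float(np.dot(query_vec, rec["embedding"])),
    reverse=True,
)
for rec in ranked[:3]:
    print(rec["genus"], rec.get("species"), rec["source_file"])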
rag/rag_generator.py ADDED
@@ -0,0 +1,446 @@
1
+ # rag/rag_generator.py
2
+ # ============================================================
3
+ # RAG generator using google/flan-t5-large (CPU-friendly)
4
+ #
5
+ # Goal (user-visible, structured, deterministic-first):
6
+ # - Show the user:
7
+ # KEY TRAITS:
8
+ # CONFLICTS:
9
+ # CONCLUSION:
10
+ # - KEY TRAITS and CONFLICTS are extracted deterministically from the
11
+ # shaped retriever context (preferred).
12
+ # - The LLM only writes the CONCLUSION (2–5 sentences) based on those
13
+ # extracted sections.
14
+ #
15
+ # Reliability:
16
+ # - flan-t5 sometimes echoes prompt instructions.
17
+ # - We keep the prompt extremely short and avoid imperative bullet rules.
18
+ # - We keep deterministic fallback logic if the LLM output is garbage/echo.
19
+ #
20
+ # Expected usage:
21
+ # ctx = retrieve_rag_context(..., parsed_fields=...)
22
+ # explanation = generate_genus_rag_explanation(
23
+ # phenotype_text=text,
24
+ # rag_context=ctx.get("llm_context_shaped") or ctx.get("llm_context"),
25
+ # genus=genus
26
+ # )
27
+ #
28
+ # Optional HF Space logs:
29
+ # export BACTAI_RAG_GEN_LOG_INPUT=1
30
+ # export BACTAI_RAG_GEN_LOG_OUTPUT=1
31
+ # ============================================================
32
+
33
+ from __future__ import annotations
34
+
35
+ import os
36
+ import re
37
+ import torch
38
+ from transformers import T5ForConditionalGeneration, T5Tokenizer
39
+
40
+
41
+ # ------------------------------------------------------------
42
+ # MODEL CONFIG
43
+ # ------------------------------------------------------------
44
+
45
+ MODEL_NAME = "google/flan-t5-large"
46
+
47
+ _tokenizer: T5Tokenizer | None = None
48
+ _model: T5ForConditionalGeneration | None = None
49
+
50
+ # Keep small for CPU + to reduce prompt truncation weirdness
51
+ _MAX_INPUT_TOKENS = 768
52
+ _DEFAULT_MAX_NEW_TOKENS = 160
53
+
54
+ # Hard cap the context chars we feed to T5 (prevents the model focusing on junk)
55
+ _CONTEXT_CHAR_CAP = 2400
56
+
57
+
58
+ def _get_model() -> tuple[T5Tokenizer, T5ForConditionalGeneration]:
59
+ global _tokenizer, _model
60
+ if _tokenizer is None or _model is None:
61
+ _tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
62
+ _model = T5ForConditionalGeneration.from_pretrained(
63
+ MODEL_NAME,
64
+ device_map="auto",
65
+ torch_dtype=torch.float32,
66
+ )
67
+ return _tokenizer, _model
68
+
69
+
70
+ # ------------------------------------------------------------
71
+ # DEBUG LOGGING (HF Space logs)
72
+ # ------------------------------------------------------------
73
+
74
+ RAG_GEN_LOG_INPUT = os.getenv("BACTAI_RAG_GEN_LOG_INPUT", "0").strip() == "1"
75
+ RAG_GEN_LOG_OUTPUT = os.getenv("BACTAI_RAG_GEN_LOG_OUTPUT", "0").strip() == "1"
76
+
77
+
78
+ def _log_block(title: str, body: str) -> None:
79
+ print("=" * 80)
80
+ print(f"RAG GENERATOR DEBUG — {title}")
81
+ print("=" * 80)
82
+ print(body.strip() if body else "")
83
+ print()
84
+
85
+
86
+ # ------------------------------------------------------------
87
+ # PROMPT (LLM WRITES ONLY THE CONCLUSION)
88
+ # ------------------------------------------------------------
89
+
90
+ # Intentionally minimal. No "rules list", no bullets specification.
91
+ # The LLM sees ONLY extracted matches/conflicts and writes a short conclusion.
92
+ RAG_PROMPT = """summarize: Evaluate whether the phenotype fits the target genus using the provided matches and conflicts.
93
+
94
+ Target genus: {genus}
95
+
96
+ Key traits that match:
97
+ {matches}
98
+
99
+ Conflicts:
100
+ {conflicts}
101
+
102
+ Write a short conclusion (2–5 sentences) stating whether this is a strong, moderate, or tentative genus match, and briefly mention the most important matches and conflicts.
103
+ """
104
+
105
+
106
+ # ------------------------------------------------------------
107
+ # OUTPUT CLEANUP + ECHO DETECTION
108
+ # ------------------------------------------------------------
109
+
110
+ _BAD_SUBSTRINGS = (
111
+ "summarize:",
112
+ "target genus",
113
+ "key traits that match",
114
+ "write a short conclusion",
115
+ "conflicts:",
116
+ )
117
+
118
+ def _clean_generation(text: str) -> str:
119
+ s = (text or "").strip()
120
+ if not s:
121
+ return ""
122
+
123
+ # collapse excessive whitespace/newlines
124
+ s = re.sub(r"\s*\n+\s*", " ", s).strip()
125
+ s = re.sub(r"\s{2,}", " ", s).strip()
126
+
127
+ # guard runaway length
128
+ if len(s) > 900:
129
+ s = s[:900].rstrip() + "..."
130
+
131
+ return s
132
+
133
+
134
+ def _looks_like_echo_or_garbage(text: str) -> bool:
135
+ s = (text or "").strip()
136
+ if not s:
137
+ return True
138
+
139
+ # extremely short / non-sentence
140
+ if len(s) < 25:
141
+ return True
142
+
143
+ low = s.lower()
144
+ if any(bad in low for bad in _BAD_SUBSTRINGS):
145
+ return True
146
+
147
+ # Must look like actual prose
148
+ if "." not in s and "because" not in low and "match" not in low and "fits" not in low:
149
+ return True
150
+
151
+ return False
152
+
153
+
154
+ # ------------------------------------------------------------
155
+ # EXTRACT KEY TRAITS + CONFLICTS FROM SHAPED CONTEXT
156
+ # ------------------------------------------------------------
157
+
158
+ # Shaped context format (example):
159
+ # KEY MATCHES:
160
+ # - Trait: Value (matches reference: ...)
161
+ #
162
+ # CONFLICTS (observed vs CORE traits):
163
+ # - Trait: Value (conflicts reference: ...)
164
+ # or:
165
+ # CONFLICTS: Not specified.
166
+
167
+ _KEY_MATCHES_HEADER_RE = re.compile(r"^\s*KEY MATCHES\s*:\s*$", re.IGNORECASE)
168
+ _CONFLICTS_HEADER_RE = re.compile(r"^\s*CONFLICTS\b.*:\s*$", re.IGNORECASE)
169
+ _CONFLICTS_INLINE_NONE_RE = re.compile(r"^\s*CONFLICTS\s*:\s*not specified\.?\s*$", re.IGNORECASE)
170
+
171
+ _MATCH_LINE_RE = re.compile(
172
+ r"^\s*-\s*([^:]+)\s*:\s*(.+?)\s*\(matches reference:\s*(.+?)\)\s*$",
173
+ re.IGNORECASE,
174
+ )
175
+ _CONFLICT_LINE_RE = re.compile(
176
+ r"^\s*-\s*([^:]+)\s*:\s*(.+?)\s*\(conflicts reference:\s*(.+?)\)\s*$",
177
+ re.IGNORECASE,
178
+ )
179
+
180
+ # More permissive bullet capture (if shaper changes slightly)
181
+ _GENERIC_BULLET_RE = re.compile(r"^\s*-\s*(.+?)\s*$")
182
+
183
+
184
+ def _extract_key_traits_and_conflicts(shaped_ctx: str) -> tuple[list[str], list[str], bool]:
185
+ """
186
+ Extracts KEY MATCHES and CONFLICTS bullets from shaped retriever context.
187
+
188
+ Returns:
189
+ (key_traits, conflicts, found_structured_headers)
190
+
191
+ - key_traits items are short: "Trait: ObservedValue"
192
+ - conflicts items are short: "Trait: ObservedValue"
193
+ """
194
+ key_traits: list[str] = []
195
+ conflicts: list[str] = []
196
+
197
+ lines = (shaped_ctx or "").splitlines()
198
+ if not lines:
199
+ return key_traits, conflicts, False
200
+
201
+ in_matches = False
202
+ in_conflicts = False
203
+ saw_headers = False
204
+
205
+ for raw in lines:
206
+ line = raw.rstrip("\n")
207
+
208
+ # detect headers
209
+ if _KEY_MATCHES_HEADER_RE.match(line.strip()):
210
+ in_matches = True
211
+ in_conflicts = False
212
+ saw_headers = True
213
+ continue
214
+
215
+ if _CONFLICTS_INLINE_NONE_RE.match(line.strip()):
216
+ in_matches = False
217
+ in_conflicts = False
218
+ saw_headers = True
219
+ # explicit "no conflicts"
220
+ continue
221
+
222
+ if _CONFLICTS_HEADER_RE.match(line.strip()):
223
+ in_matches = False
224
+ in_conflicts = True
225
+ saw_headers = True
226
+ continue
227
+
228
+ # stop capture if another section begins (common shaper headings)
229
+ if saw_headers and (line.strip().endswith(":") and not line.strip().startswith("-")):
230
+ # If it's a new heading (and not one of our two), stop both
231
+ if not _KEY_MATCHES_HEADER_RE.match(line.strip()) and not _CONFLICTS_HEADER_RE.match(line.strip()):
232
+ in_matches = False
233
+ in_conflicts = False
234
+
235
+ # capture bullets under each section
236
+ if in_matches and line.strip().startswith("-"):
237
+ m = _MATCH_LINE_RE.match(line.strip())
238
+ if m:
239
+ trait = m.group(1).strip()
240
+ obs = m.group(2).strip()
241
+ key_traits.append(f"{trait}: {obs}")
242
+ else:
243
+ g = _GENERIC_BULLET_RE.match(line.strip())
244
+ if g:
245
+ key_traits.append(g.group(1).strip())
246
+ continue
247
+
248
+ if in_conflicts and line.strip().startswith("-"):
249
+ c = _CONFLICT_LINE_RE.match(line.strip())
250
+ if c:
251
+ trait = c.group(1).strip()
252
+ obs = c.group(2).strip()
253
+ conflicts.append(f"{trait}: {obs}")
254
+ else:
255
+ g = _GENERIC_BULLET_RE.match(line.strip())
256
+ if g:
257
+ conflicts.append(g.group(1).strip())
258
+ continue
259
+
260
+ return key_traits, conflicts, saw_headers
261
+
262
+
263
+ def _extract_matches_conflicts_legacy(shaped_ctx: str) -> tuple[list[str], list[str]]:
264
+ """
265
+ Legacy extraction based purely on (matches reference: ...) / (conflicts reference: ...)
266
+ anywhere in the text. Useful if headers are missing.
267
+ """
268
+ matches: list[str] = []
269
+ conflicts: list[str] = []
270
+
271
+ for raw in (shaped_ctx or "").splitlines():
272
+ line = raw.strip()
273
+ if not line.startswith("-"):
274
+ continue
275
+
276
+ m = _MATCH_LINE_RE.match(line)
277
+ if m:
278
+ trait = m.group(1).strip()
279
+ obs = m.group(2).strip()
280
+ matches.append(f"{trait}: {obs}")
281
+ continue
282
+
283
+ c = _CONFLICT_LINE_RE.match(line)
284
+ if c:
285
+ trait = c.group(1).strip()
286
+ obs = c.group(2).strip()
287
+ conflicts.append(f"{trait}: {obs}")
288
+ continue
289
+
290
+ return matches, conflicts
291
+
292
+
293
+ def _format_bullets(items: list[str], *, none_text: str) -> str:
294
+ if not items:
295
+ return none_text
296
+ return "\n".join(f"- {x}" for x in items)
297
+
298
+
299
+ # ------------------------------------------------------------
300
+ # DETERMINISTIC CONCLUSION FALLBACK
301
+ # ------------------------------------------------------------
302
+
303
+ def _deterministic_conclusion(genus: str, key_traits: list[str], conflicts: list[str]) -> str:
304
+ g = (genus or "").strip() or "Unknown"
305
+
306
+ m = key_traits[:4]
307
+ c = conflicts[:2]
308
+
309
+ if m and c:
310
+ return (
311
+ f"This is a probable match to {g} because it aligns with key traits such as "
312
+ f"{', '.join(m)}. However, there are conflicts ({', '.join(c)}), so treat this "
313
+ f"as a moderate/tentative genus-level fit and consider re-checking the conflicting tests."
314
+ )
315
+ if m and not c:
316
+ return (
317
+ f"This phenotype is consistent with {g} based on key matching traits such as "
318
+ f"{', '.join(m)}. No major conflicts were detected against the retrieved core genus traits, "
319
+ f"supporting a strong genus-level match."
320
+ )
321
+ if (not m) and c:
322
+ return (
323
+ f"This phenotype does not cleanly fit {g} because it conflicts with core traits "
324
+ f"({', '.join(c)}). Consider re-checking those tests or comparing against the next-ranked genera."
325
+ )
326
+
327
+ return (
328
+ f"Reference evidence was available for {g}, but no clear matches or conflicts could be extracted "
329
+ f"from the shaped context. Try increasing top_k genus chunks or ensuring parsed_fields are being "
330
+ f"passed into retrieve_rag_context so the shaper can compute KEY MATCHES and CONFLICTS."
331
+ )
332
+
333
+
334
+ def _trim_context(ctx: str) -> str:
335
+ s = (ctx or "").strip()
336
+ if not s:
337
+ return ""
338
+ if len(s) <= _CONTEXT_CHAR_CAP:
339
+ return s
340
+ return s[:_CONTEXT_CHAR_CAP].rstrip() + "\n... (truncated)"
341
+
342
+
343
+ # ------------------------------------------------------------
344
+ # PUBLIC API
345
+ # ------------------------------------------------------------
346
+
347
+ def generate_genus_rag_explanation(
348
+ phenotype_text: str,
349
+ rag_context: str,
350
+ genus: str,
351
+ max_new_tokens: int = _DEFAULT_MAX_NEW_TOKENS,
352
+ ) -> str:
353
+ """
354
+ Generates a structured RAG output intended for direct display:
355
+
356
+ KEY TRAITS:
357
+ - ...
358
+ CONFLICTS:
359
+ - ...
360
+ CONCLUSION:
361
+ ...
362
+
363
+ Notes:
364
+ - KEY TRAITS + CONFLICTS are extracted deterministically from the (shaped) context.
365
+ - The LLM writes only the CONCLUSION.
366
+ - If the LLM output is garbage/echo, we use a deterministic conclusion fallback.
367
+ """
368
+ tokenizer, model = _get_model()
369
+
370
+ genus_clean = (genus or "").strip() or "Unknown"
371
+ context = _trim_context(rag_context or "")
372
+
373
+ if not context:
374
+ return (
375
+ "KEY TRAITS:\n"
376
+ "- Not specified.\n\n"
377
+ "CONFLICTS:\n"
378
+ "- Not specified.\n\n"
379
+ "CONCLUSION:\n"
380
+ "No reference evidence was available to evaluate this genus against the observed phenotype."
381
+ )
382
+
383
+ # Prefer structured extraction (KEY MATCHES / CONFLICTS sections)
384
+ key_traits, conflicts, saw_headers = _extract_key_traits_and_conflicts(context)
385
+
386
+ # If the headers weren't found or extraction is empty, try legacy extraction
387
+ if (not saw_headers) or (not key_traits and not conflicts):
388
+ legacy_matches, legacy_conflicts = _extract_matches_conflicts_legacy(context)
389
+ if legacy_matches or legacy_conflicts:
390
+ key_traits = key_traits or legacy_matches
391
+ conflicts = conflicts or legacy_conflicts
392
+
393
+ key_traits_text = _format_bullets(key_traits, none_text="- Not specified.")
394
+ conflicts_text = _format_bullets(conflicts, none_text="- Not specified.")
395
+
396
+ # LLM: conclusion only
397
+ prompt = RAG_PROMPT.format(
398
+ genus=genus_clean,
399
+ matches=key_traits_text,
400
+ conflicts=conflicts_text,
401
+ )
402
+
403
+ if RAG_GEN_LOG_INPUT:
404
+ _log_block("PROMPT (CONCLUSION-ONLY)", prompt[:3000] + ("\n...(truncated)" if len(prompt) > 3000 else ""))
405
+
406
+ inputs = tokenizer(
407
+ prompt,
408
+ return_tensors="pt",
409
+ truncation=True,
410
+ max_length=_MAX_INPUT_TOKENS,
411
+ ).to(model.device)
412
+
413
+ output = model.generate(
414
+ **inputs,
415
+ max_new_tokens=max_new_tokens,
416
+ temperature=0.0,
417
+ num_beams=1,
418
+ do_sample=False,
419
+ repetition_penalty=1.2,
420
+ no_repeat_ngram_size=3,
421
+ )
422
+
423
+ decoded = tokenizer.decode(output[0], skip_special_tokens=True).strip()
424
+ cleaned = _clean_generation(decoded)
425
+
426
+ if RAG_GEN_LOG_OUTPUT:
427
+ _log_block("RAW OUTPUT (CONCLUSION)", decoded)
428
+ _log_block("CLEANED OUTPUT (CONCLUSION)", cleaned)
429
+
430
+ # If LLM output is junk, use deterministic conclusion
431
+ if _looks_like_echo_or_garbage(cleaned):
432
+ cleaned = _deterministic_conclusion(genus_clean, key_traits, conflicts)
433
+ if RAG_GEN_LOG_OUTPUT:
434
+ _log_block("FALLBACK CONCLUSION (DETERMINISTIC)", cleaned)
435
+
436
+ # Final user-visible structured output
437
+ final = (
438
+ "KEY TRAITS:\n"
439
+ f"{key_traits_text}\n\n"
440
+ "CONFLICTS:\n"
441
+ f"{conflicts_text}\n\n"
442
+ "CONCLUSION:\n"
443
+ f"{cleaned}"
444
+ )
445
+
446
+ return final
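
An end-to-end sketch mirroring the "Expected usage" note in the file header; the phenotype text, parsed fields, and genus are illustrative.

# Hedged end-to-end sketch (mirrors the "Expected usage" note above; values illustrative):
from rag.rag_retriever import retrieve_rag_context
from rag.rag_generator import generate_genus_rag_explanation

phenotype_text = "Gram positive cocci in clusters, catalase positive, coagulase positive"
parsed_fields = {"Gram Stain": "Positive", "Shape": "Cocci", "Catalase": "Positive"}

ctx = retrieve_rag_context(phenotype_text, "Staphylococcus", parsed_fields=parsed_fields)
explanation = generate_genus_rag_explanation(
    phenotype_text=phenotype_text,
    rag_context=ctx.get("llm_context_shaped") or ctx.get("llm_context"),
    genus="Staphylococcus",
)
print(explanation)  # "KEY TRAITS:" / "CONFLICTS:" / "CONCLUSION:" block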
rag/rag_retriever.py ADDED
@@ -0,0 +1,509 @@
1
+ # rag/rag_retriever.py
2
+ # ============================================================
3
+ # RAG retriever (Stage 2 – microbiology-aware)
4
+ #
5
+ # Key change (GENUS-FIRST):
6
+ # - The generator must NOT see multiple species dumps.
7
+ # - We retrieve GENUS-level records only for llm_context/llm_context_shaped.
8
+ # - Species is handled separately (deterministic species_scorer), not via LLM context.
9
+ #
10
+ # Improvements retained:
11
+ # - Source-type weighting (but genus-only for generator)
12
+ # - Genus-aware query expansion
13
+ # - Diversity enforcement (avoid duplicate sources)
14
+ # - Explicit ranking & score annotations for generator (DEBUG ONLY)
15
+ # - OPTIONAL: species evidence scoring (deterministic)
16
+ # - NEW: Context shaper (deterministic) -> resolves conflicts + emits genus-ready summary
17
+ #
18
+ # IMPORTANT:
19
+ # - We return THREE contexts:
20
+ # 1) llm_context -> GENUS-only raw text (SAFE but unshaped)
21
+ # 2) llm_context_shaped -> shaped, conflict-aware, generator-friendly
22
+ # 3) debug_context -> includes RANK/SCORE/WEIGHTS (UI/logging only)
23
+ # ============================================================
24
+
25
+ from __future__ import annotations
26
+
27
+ from typing import List, Dict, Any, Optional, Tuple
28
+ import re
29
+ import numpy as np
30
+
31
+ from rag.rag_embedder import embed_text, load_kb_index
32
+
33
+ # deterministic species evidence scorer (separate from generator context)
34
+ try:
35
+ from rag.species_scorer import score_species_for_genus
36
+ HAS_SPECIES_SCORER = True
37
+ except Exception:
38
+ score_species_for_genus = None # type: ignore
39
+ HAS_SPECIES_SCORER = False
40
+
41
+
42
+ # ------------------------------------------------------------
43
+ # Configuration
44
+ # ------------------------------------------------------------
45
+
46
+ # NOTE: We keep these for debug display + potential fallback modes.
47
+ SOURCE_TYPE_WEIGHTS = {
48
+ "species": 1.15,
49
+ "genus": 1.00,
50
+ "table": 1.10,
51
+ "note": 0.85,
52
+ }
53
+
54
+ MAX_CHUNKS_PER_SOURCE = 1
55
+
56
+ # Context shaping caps (keeps prompt within LLM limits)
57
+ SHAPER_MAX_CORE = 14
58
+ SHAPER_MAX_VARIABLE = 12
59
+ SHAPER_MAX_MATCHES = 14
60
+ SHAPER_MAX_CONFLICTS = 12
61
+ SHAPER_MAX_TOTAL_CHARS = 9000 # final guardrail
62
+
63
+
64
+ # ------------------------------------------------------------
65
+ # Similarity helper
66
+ # ------------------------------------------------------------
67
+
68
+ def _cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
69
+ """
70
+ Cosine similarity for normalized embeddings.
71
+ Assumes both vectors are already L2-normalized.
72
+ """
73
+ return float(np.dot(a, b))
74
+
75
+
76
+ # ------------------------------------------------------------
77
+ # Context Shaper (deterministic)
78
+ # ------------------------------------------------------------
79
+
80
+ _TRAIT_LINE_RE = re.compile(
81
+ r"^\s*([A-Za-z0-9][A-Za-z0-9 \/\-\(\)\[\]%>=<\+\.]*?)\s*:\s*(.+?)\s*$"
82
+ )
83
+
84
+ # Headers / junk lines we don't want treated as traits
85
+ _SHAPER_SKIP_PREFIXES = (
86
+ "expected fields for species",
87
+ "expected fields for genus",
88
+ "reference context",
89
+ "genus evidence primer",
90
+ )
91
+
92
+ def _norm_val(v: str) -> str:
93
+ s = (v or "").strip()
94
+ if not s:
95
+ return ""
96
+ s = re.sub(r"\s+", " ", s)
97
+ return s
98
+
99
+ def _canon_bool(v: str) -> str:
100
+ """
101
+ Canonicalize common boolean-ish microbiology values.
102
+ Conservative: no inference.
103
+ """
104
+ s = _norm_val(v).lower()
105
+ if s in {"pos", "positive", "+", "reactive"}:
106
+ return "Positive"
107
+ if s in {"neg", "negative", "-", "nonreactive", "non-reactive"}:
108
+ return "Negative"
109
+ if s in {"none"}:
110
+ return "None"
111
+ if s in {"unknown", "not specified", "n/a", "na"}:
112
+ return "Unknown"
113
+ if s in {"variable"}:
114
+ return "Variable"
115
+ return _norm_val(v)
116
+
117
+ def _canon_trait_name(name: str) -> str:
118
+ s = _norm_val(name)
119
+ s_low = s.lower()
120
+ if s_low == "ornitihine decarboxylase":
121
+ return "Ornithine Decarboxylase"
122
+ return s
123
+
124
+ def _extract_traits_from_text_block(text: str) -> List[Tuple[str, str]]:
125
+ """
126
+ Extract (trait, value) pairs from lines like:
127
+ Trait Name: Value
128
+ """
129
+ pairs: List[Tuple[str, str]] = []
130
+ for raw_line in (text or "").splitlines():
131
+ line = raw_line.strip()
132
+ if not line:
133
+ continue
134
+ low = line.lower()
135
+ if any(low.startswith(p) for p in _SHAPER_SKIP_PREFIXES):
136
+ continue
137
+ m = _TRAIT_LINE_RE.match(line)
138
+ if not m:
139
+ continue
140
+ k = _canon_trait_name(m.group(1))
141
+ v = _canon_bool(m.group(2))
142
+ if not k or not v:
143
+ continue
144
+ pairs.append((k, v))
145
+ return pairs
146
+
147
+ def _compare_vals(observed: str, reference: str) -> Optional[bool]:
148
+ """
149
+ Returns:
150
+ True -> match
151
+ False -> conflict
152
+ None -> cannot compare (unknown/variable/empty)
153
+ """
154
+ o = _canon_bool(observed)
155
+ r = _canon_bool(reference)
156
+
157
+ if not o or o == "Unknown":
158
+ return None
159
+ if not r or r in {"Unknown", "Variable"}:
160
+ return None
161
+
162
+ if o == r:
163
+ return True
164
+
165
+ # Safe equivalences (very conservative)
166
+ eq = {
167
+ ("None", "Negative"),
168
+ ("Negative", "None"),
169
+ }
170
+ if (o, r) in eq:
171
+ return True
172
+
173
+ return False
174
+
175
+ def shape_genus_context(
176
+ *,
177
+ target_genus: str,
178
+ selected_chunks: List[Dict[str, Any]],
179
+ parsed_fields: Optional[Dict[str, str]] = None,
180
+ ) -> str:
181
+ """
182
+ Deterministic, GENUS-focused context shaper.
183
+
184
+ It:
185
+ - aggregates trait lines across retrieved GENUS chunks
186
+ - identifies CORE traits (single consistent value across chunks)
187
+ - identifies VARIABLE traits (multiple values across chunks)
188
+ - if parsed_fields provided, derives:
189
+ - phenotype-supported matches vs CORE traits
190
+ - phenotype conflicts vs CORE traits
191
+ - outputs a compact, reasoning-friendly block for the generator
192
+ """
193
+ genus = (target_genus or "").strip() or "Unknown"
194
+
195
+ trait_values: Dict[str, List[str]] = {}
196
+
197
+ for rec in selected_chunks or []:
198
+ txt = (rec.get("text") or "").strip()
199
+ if not txt:
200
+ continue
201
+ for k, v in _extract_traits_from_text_block(txt):
202
+ trait_values.setdefault(k, []).append(v)
203
+
204
+ # Reduce to unique canonical values
205
+ trait_uniques: Dict[str, List[str]] = {}
206
+ for k, vals in trait_values.items():
207
+ uniq: List[str] = []
208
+ for v in vals:
209
+ vv = _canon_bool(v)
210
+ if not vv:
211
+ continue
212
+ if vv not in uniq:
213
+ uniq.append(vv)
214
+ if uniq:
215
+ trait_uniques[k] = uniq
216
+
217
+ core_traits: List[Tuple[str, str]] = []
218
+ variable_traits: List[Tuple[str, str]] = []
219
+
220
+ for k, uniq in trait_uniques.items():
221
+ if len(uniq) == 1:
222
+ core_traits.append((k, uniq[0]))
223
+ else:
224
+ variable_traits.append((k, " / ".join(uniq)))
225
+
226
+ PRIORITY = {
227
+ "Gram Stain": 1,
228
+ "Shape": 2,
229
+ "Motility": 3,
230
+ "Motility Type": 4,
231
+ "Oxidase": 5,
232
+ "Catalase": 6,
233
+ "Oxygen Requirement": 7,
234
+ "Lactose Fermentation": 8,
235
+ "Glucose Fermentation": 9,
236
+ "H2S": 10,
237
+ "Indole": 11,
238
+ "Urease": 12,
239
+ "Citrate": 13,
240
+ "ONPG": 14,
241
+ "NaCl Tolerant (>=6%)": 15,
242
+ "Media Grown On": 16,
243
+ "Colony Morphology": 17,
244
+ }
245
+
246
+ def _sort_key(item: Tuple[str, str]) -> Tuple[int, str]:
247
+ return (PRIORITY.get(item[0], 999), item[0].lower())
248
+
249
+ core_traits.sort(key=_sort_key)
250
+ variable_traits.sort(key=_sort_key)
251
+
252
+ core_traits = core_traits[:SHAPER_MAX_CORE]
253
+ variable_traits = variable_traits[:SHAPER_MAX_VARIABLE]
254
+
255
+ matches: List[str] = []
256
+ conflicts: List[str] = []
257
+
258
+ if parsed_fields:
259
+ for k, ref_v in core_traits:
260
+ obs_v = parsed_fields.get(k)
261
+ if obs_v is None:
262
+ continue
263
+ cmp = _compare_vals(obs_v, ref_v)
264
+ if cmp is True:
265
+ matches.append(f"- {k}: {_canon_bool(obs_v)} (matches reference: {ref_v})")
266
+ elif cmp is False:
267
+ conflicts.append(f"- {k}: {_canon_bool(obs_v)} (conflicts reference: {ref_v})")
268
+
269
+ matches = matches[:SHAPER_MAX_MATCHES]
270
+ conflicts = conflicts[:SHAPER_MAX_CONFLICTS]
271
+
272
+ lines: List[str] = []
273
+ lines.append(f"GENUS SUMMARY (reference-driven): {genus}")
274
+
275
+ if core_traits:
276
+ lines.append("\nCORE GENUS TRAITS (consistent across retrieved genus references):")
277
+ for k, v in core_traits:
278
+ lines.append(f"- {k}: {v}")
279
+ else:
280
+ lines.append("\nCORE GENUS TRAITS: Not available from retrieved context.")
281
+
282
+ if variable_traits:
283
+ lines.append("\nTRAITS VARIABLE ACROSS RETRIEVED GENUS REFERENCES (do not treat as contradictions):")
284
+ for k, v in variable_traits:
285
+ lines.append(f"- {k}: Variable ({v})")
286
+
287
+ if parsed_fields:
288
+ lines.append("\nPHENOTYPE SUPPORT (observed vs CORE traits):")
289
+ if matches:
290
+ lines.append("KEY MATCHES:")
291
+ lines.extend(matches)
292
+ else:
293
+ lines.append("KEY MATCHES: Not specified.")
294
+
295
+ if conflicts:
296
+ lines.append("\nCONFLICTS (observed vs CORE traits):")
297
+ lines.extend(conflicts)
298
+ else:
299
+ lines.append("\nCONFLICTS: Not specified.")
300
+
301
+ shaped = "\n".join(lines).strip()
302
+
303
+ if len(shaped) > SHAPER_MAX_TOTAL_CHARS:
304
+ shaped = shaped[:SHAPER_MAX_TOTAL_CHARS].rstrip() + "\n... (truncated)"
305
+
306
+ return shaped
307
+
308
+
309
+ # ------------------------------------------------------------
310
+ # Public API
311
+ # ------------------------------------------------------------
312
+
313
+ def retrieve_rag_context(
314
+ phenotype_text: str,
315
+ target_genus: str,
316
+ top_k: int = 5,
317
+ kb_path: str = "data/rag/index/kb_index.json",
318
+ parsed_fields: Optional[Dict[str, str]] = None,
319
+ species_top_n: int = 5,
320
+ allow_species_fallback: bool = False,
321
+ ) -> Dict[str, Any]:
322
+ """
323
+ Retrieve the most relevant RAG chunks for a phenotype + genus.
324
+
325
+ GENUS-FIRST behavior:
326
+ - For LLM generator contexts, we retrieve ONLY genus-level records (level == "genus").
327
+ - Species is handled separately via deterministic species_scorer.
328
+
329
+ Optional:
330
+ parsed_fields -> enables species evidence scoring + context shaping matches/conflicts.
331
+
332
+ Returns:
333
+ {
334
+ "genus": target_genus,
335
+ "chunks": [...], # ranked chunk metadata (GENUS chunks unless fallback enabled)
336
+ "llm_context": "....", # GENUS raw text (no scores)
337
+ "llm_context_shaped": "....", # deterministic genus-friendly summary
338
+ "debug_context": "....", # annotated with rank/score/weights
339
+ "species_evidence": { ... } # optional deterministic species scoring
340
+ }
341
+ """
342
+
343
+ kb = load_kb_index(kb_path)
344
+ records = kb.get("records", [])
345
+
346
+ if not records:
347
+ return {
348
+ "genus": target_genus,
349
+ "chunks": [],
350
+ "llm_context": "",
351
+ "llm_context_shaped": "",
352
+ "debug_context": "",
353
+ "species_evidence": {"genus": target_genus, "ranked": []},
354
+ }
355
+
356
+ query_text = (phenotype_text or "").strip()
357
+ if target_genus:
358
+ query_text = f"{query_text}\nTarget genus: {target_genus}"
359
+
360
+ q_emb = embed_text(query_text, normalize=True)
361
+ target_genus_lc = (target_genus or "").strip().lower()
362
+
363
+ scored_records: List[Dict[str, Any]] = []
364
+
365
+ # --------------------------------------------------------
366
+ # Primary pass: STRICT genus-filtered + GENUS-LEVEL only
367
+ # --------------------------------------------------------
368
+ for rec in records:
369
+ rec_genus = (rec.get("genus") or "").strip().lower()
370
+ if target_genus_lc and rec_genus != target_genus_lc:
371
+ continue
372
+
373
+ level = (rec.get("level") or "").strip().lower()
374
+ if level != "genus":
375
+ continue # GENUS-ONLY for generator context
376
+
377
+ emb = rec.get("embedding")
378
+ if emb is None:
379
+ continue
380
+
381
+ base_score = _cosine_similarity(q_emb, emb)
382
+ weight = SOURCE_TYPE_WEIGHTS.get(level, 1.0)
383
+ score = base_score * weight
384
+
385
+ scored_records.append(
386
+ {
387
+ "id": rec.get("id"),
388
+ "genus": rec.get("genus"),
389
+ "species": rec.get("species"),
390
+ "source_type": level,
391
+ "path": rec.get("source_file"),
392
+ "text": rec.get("text"),
393
+ "score": float(score),
394
+ "base_score": float(base_score),
395
+ "type_weight": float(weight),
396
+ "section": rec.get("section"),
397
+ "role": rec.get("role"),
398
+ "chunk_id": rec.get("chunk_id"),
399
+ }
400
+ )
401
+
402
+ # --------------------------------------------------------
403
+ # Fallback modes
404
+ # --------------------------------------------------------
405
+ if not scored_records and allow_species_fallback:
406
+ # Emergency fallback: allow any level if no genus chunks exist.
407
+ # This keeps your app functioning, but can reintroduce noise.
408
+ for rec in records:
409
+ rec_genus = (rec.get("genus") or "").strip().lower()
410
+ if target_genus_lc and rec_genus != target_genus_lc:
411
+ continue
412
+
413
+ emb = rec.get("embedding")
414
+ if emb is None:
415
+ continue
416
+
417
+ level = (rec.get("level") or "").strip().lower()
418
+ base_score = _cosine_similarity(q_emb, emb)
419
+ weight = SOURCE_TYPE_WEIGHTS.get(level, 1.0)
420
+ score = base_score * weight
421
+
422
+ scored_records.append(
423
+ {
424
+ "id": rec.get("id"),
425
+ "genus": rec.get("genus"),
426
+ "species": rec.get("species"),
427
+ "source_type": level,
428
+ "path": rec.get("source_file"),
429
+ "text": rec.get("text"),
430
+ "score": float(score),
431
+ "base_score": float(base_score),
432
+ "type_weight": float(weight),
433
+ "section": rec.get("section"),
434
+ "role": rec.get("role"),
435
+ "chunk_id": rec.get("chunk_id"),
436
+ }
437
+ )
438
+
439
+ # Sort by score
440
+ scored_records.sort(key=lambda r: r["score"], reverse=True)
441
+
442
+ # Diversity enforcement
443
+ selected: List[Dict[str, Any]] = []
444
+ source_counts: Dict[str, int] = {}
445
+
446
+ for rec in scored_records:
447
+ src = rec.get("path") or ""
448
+ count = source_counts.get(src, 0)
449
+ if count >= MAX_CHUNKS_PER_SOURCE:
450
+ continue
451
+ selected.append(rec)
452
+ source_counts[src] = count + 1
453
+ if len(selected) >= top_k:
454
+ break
455
+
456
+ # Build contexts
457
+ llm_ctx_parts: List[str] = []
458
+ debug_ctx_parts: List[str] = []
459
+
460
+ for idx, rec in enumerate(selected, start=1):
461
+ txt = (rec.get("text") or "").strip()
462
+ if txt:
463
+ llm_ctx_parts.append(txt)
464
+
465
+ label = rec.get("genus") or "Unknown genus"
466
+ if rec.get("species"):
467
+ label = f"{label} {rec['species']}"
468
+
469
+ debug_ctx_parts.append(
470
+ f"[RANK {idx} | SCORE {rec['score']:.3f} | BASE {rec['base_score']:.3f} | "
471
+ f"W {rec['type_weight']:.2f} | {label} — {rec.get('source_type')}]"
472
+ + (
473
+ f" [section={rec.get('section')} role={rec.get('role')}]"
474
+ if rec.get("section") or rec.get("role")
475
+ else ""
476
+ )
477
+ + "\n"
478
+ + (txt or "")
479
+ )
480
+
481
+ llm_context = "\n\n".join(llm_ctx_parts).strip()
482
+ debug_context = "\n\n".join(debug_ctx_parts).strip()
483
+
484
+ llm_context_shaped = shape_genus_context(
485
+ target_genus=target_genus,
486
+ selected_chunks=selected,
487
+ parsed_fields=parsed_fields,
488
+ )
489
+
490
+ # OPTIONAL: deterministic species evidence scoring
491
+ species_evidence = {"genus": target_genus, "ranked": []}
492
+ if parsed_fields and HAS_SPECIES_SCORER and score_species_for_genus is not None:
493
+ try:
494
+ species_evidence = score_species_for_genus(
495
+ target_genus=target_genus,
496
+ parsed_fields=parsed_fields,
497
+ top_n=species_top_n,
498
+ )
499
+ except Exception:
500
+ species_evidence = {"genus": target_genus, "ranked": []}
501
+
502
+ return {
503
+ "genus": target_genus,
504
+ "chunks": selected,
505
+ "llm_context": llm_context,
506
+ "llm_context_shaped": llm_context_shaped,
507
+ "debug_context": debug_context,
508
+ "species_evidence": species_evidence,
509
+ }
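
A brief sketch of consuming the retriever's return value, focusing on the keys documented in the docstring above; the query and parsed fields are illustrative.

# Hedged sketch of consuming the documented return keys (illustrative inputs):
from rag.rag_retriever import retrieve_rag_context

ctx = retrieve_rag_context(
    phenotype_text="Gram negative rod, lactose fermenter, indole positive",
    target_genus="Escherichia",
    top_k=5,
    parsed_fields={"Gram Stain": "Negative", "Indole": "Positive"},
)

print(ctx["llm_context_shaped"])  # conflict-aware summary fed to the generator
print(ctx["debug_context"])       # RANK/SCORE-annotated chunks (UI/logging only)
for entry in ctx["species_evidence"].get("ranked", []):
    print(entry)                  # deterministic species evidence, when the scorer is available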
rag/species_scorer.py ADDED
@@ -0,0 +1,314 @@
1
+ # rag/species_scorer.py
2
+ # ============================================================
3
+ # Species evidence scorer (deterministic, explainable)
4
+ #
5
+ # Given:
6
+ # - target_genus
7
+ # - parsed_fields (from fusion)
8
+ # It loads species JSON files under:
9
+ # data/rag/knowledge_base/<Genus>/*.json (excluding genus.json)
10
+ #
11
+ # And returns:
12
+ # - ranked species list with scores
13
+ # - explicit matches / conflicts
14
+ # - marker hits (importance-weighted)
15
+ #
16
+ # Notes:
17
+ # - This is NOT an LLM. No speculation.
18
+ # - Handles list-like fields (Media / Colony Morphology) as overlap scores.
19
+ # - Handles P/N/V/Unknown fields.
20
+ # ============================================================
21
+
22
+ from __future__ import annotations
23
+
24
+ import json
25
+ import os
26
+ import re
27
+ from typing import Any, Dict, List, Tuple, Optional
28
+
29
+
30
+ KB_ROOT = os.path.join("data", "rag", "knowledge_base")
31
+
32
+ UNKNOWN = "Unknown"
33
+
34
+ LIST_FIELDS = {
35
+ "Media Grown On",
36
+ "Colony Morphology",
37
+ }
38
+
39
+ # Importance → weight
40
+ MARKER_WEIGHT = {
41
+ "high": 3.0,
42
+ "medium": 2.0,
43
+ "low": 1.5,
44
+ }
45
+
46
+ # Base scoring weights
47
+ FIELD_MATCH_WEIGHT = 1.0
48
+ FIELD_CONFLICT_PENALTY = 1.2 # conflicts hurt slightly more than matches help
49
+ VARIABLE_MATCH_BONUS = 0.2 # weak support if expected is Variable
50
+
51
+
52
+ def _norm_str(v: Any) -> str:
53
+ if v is None:
54
+ return ""
55
+ return str(v).strip()
56
+
57
+
58
+ def _norm_val(v: Any) -> str:
59
+ s = _norm_str(v)
60
+ return s if s else UNKNOWN
61
+
62
+
63
+ def _split_semicolon(s: str) -> List[str]:
64
+ parts = [p.strip() for p in re.split(r"[;,\n]+", s or "") if p.strip()]
65
+ # normalize case lightly for matching
66
+ return [p.lower() for p in parts]
67
+
68
+
69
+ def _as_list_lower(v: Any) -> List[str]:
70
+ if v is None:
71
+ return []
72
+ if isinstance(v, list):
73
+ return [str(x).strip().lower() for x in v if str(x).strip()]
74
+ # string fallback
75
+ return _split_semicolon(str(v))
76
+
77
+
78
+ def _overlap_score(expected_list: List[str], observed_list: List[str]) -> float:
79
+ """
80
+ Jaccard-like overlap, but anchored to expected:
81
+ score = (# of expected items found) / (# expected)
82
+ """
83
+ if not expected_list:
84
+ return 0.0
85
+ if not observed_list:
86
+ return 0.0
87
+ exp = set(expected_list)
88
+ obs = set(observed_list)
89
+ hit = len(exp.intersection(obs))
90
+ return hit / max(1, len(exp))
91
+
92
+
93
+ def _load_species_docs_for_genus(target_genus: str) -> List[Dict[str, Any]]:
94
+ genus = (target_genus or "").strip()
95
+ if not genus:
96
+ return []
97
+
98
+ genus_dir = os.path.join(KB_ROOT, genus)
99
+ if not os.path.isdir(genus_dir):
100
+ return []
101
+
102
+ docs: List[Dict[str, Any]] = []
103
+ for fname in sorted(os.listdir(genus_dir)):
104
+ if not fname.lower().endswith(".json"):
105
+ continue
106
+ if fname == "genus.json":
107
+ continue
108
+
109
+ path = os.path.join(genus_dir, fname)
110
+ try:
111
+ with open(path, "r", encoding="utf-8") as f:
112
+ doc = json.load(f)
113
+ if isinstance(doc, dict) and doc.get("level") == "species":
114
+ doc["_source_path"] = os.path.relpath(path)
115
+ docs.append(doc)
116
+ except Exception:
117
+ continue
118
+
119
+ return docs
120
+
121
+
122
+ def _score_expected_fields(
123
+ expected_fields: Dict[str, Any],
124
+ parsed_fields: Dict[str, str],
125
+ ) -> Tuple[float, float, List[str], List[str]]:
126
+ """
127
+ Returns:
128
+ (score, possible, matches, conflicts)
129
+ """
130
+ score = 0.0
131
+ possible = 0.0
132
+ matches: List[str] = []
133
+ conflicts: List[str] = []
134
+
135
+ for field, expected in (expected_fields or {}).items():
136
+ exp_norm = expected
137
+ obs_norm = parsed_fields.get(field, UNKNOWN)
138
+
139
+ # Skip unknown observed
140
+ if obs_norm == UNKNOWN:
141
+ continue
142
+
143
+ # List fields: overlap
144
+ if field in LIST_FIELDS:
145
+ exp_list = _as_list_lower(exp_norm)
146
+ obs_list = _as_list_lower(obs_norm)
147
+ if not exp_list:
148
+ continue
149
+
150
+ possible += FIELD_MATCH_WEIGHT
151
+ ov = _overlap_score(exp_list, obs_list)
152
+
153
+ # thresholding: any overlap = support; none = conflict
154
+ if ov > 0:
155
+ score += FIELD_MATCH_WEIGHT * ov
156
+ matches.append(f"{field}: overlap {ov:.2f}")
157
+ else:
158
+ score -= FIELD_CONFLICT_PENALTY
159
+ conflicts.append(f"{field}: expected {expected}, got {obs_norm}")
160
+ continue
161
+
162
+ exp_val = _norm_val(exp_norm)
163
+ obs_val = _norm_val(obs_norm)
164
+
165
+ # If expected is Unknown, skip
166
+ if exp_val == UNKNOWN:
167
+ continue
168
+
169
+ # If expected is Variable, weakly supportive if observed is known
170
+ if exp_val == "Variable":
171
+ possible += VARIABLE_MATCH_BONUS
172
+ score += VARIABLE_MATCH_BONUS
173
+ matches.append(f"{field}: expected Variable (observed {obs_val})")
174
+ continue
175
+
176
+ # Normal exact match
177
+ possible += FIELD_MATCH_WEIGHT
178
+ if obs_val == exp_val:
179
+ score += FIELD_MATCH_WEIGHT
180
+ matches.append(f"{field}: {obs_val}")
181
+ else:
182
+ score -= FIELD_CONFLICT_PENALTY
183
+ conflicts.append(f"{field}: expected {exp_val}, got {obs_val}")
184
+
185
+ return score, possible, matches, conflicts
186
+
187
+
188
+ def _score_species_markers(
189
+ markers: List[Dict[str, Any]],
190
+ parsed_fields: Dict[str, str],
191
+ ) -> Tuple[float, float, List[str], List[str]]:
192
+ """
193
+ Weighted marker hits. Markers are higher-signal than generic expected fields.
194
+
195
+ Returns:
196
+ (score, possible, marker_hits, marker_misses)
197
+ """
198
+ score = 0.0
199
+ possible = 0.0
200
+ hits: List[str] = []
201
+ misses: List[str] = []
202
+
203
+ for m in markers or []:
204
+ field = _norm_str(m.get("field"))
205
+ val = _norm_val(m.get("value"))
206
+ importance = _norm_str(m.get("importance")).lower() or "medium"
207
+ w = MARKER_WEIGHT.get(importance, 2.0)
208
+
209
+ if not field or val == UNKNOWN:
210
+ continue
211
+
212
+ obs = _norm_val(parsed_fields.get(field, UNKNOWN))
213
+ if obs == UNKNOWN:
214
+ continue
215
+
216
+ possible += w
217
+ if obs == val:
218
+ score += w
219
+ hits.append(f"{field}: {obs} ({importance})")
220
+ else:
221
+ score -= w * 1.1 # marker conflicts hurt more
222
+ misses.append(f"{field}: expected {val}, got {obs} ({importance})")
223
+
224
+ return score, possible, hits, misses
225
+
226
+
227
+ def _to_confidence(raw_score: float, possible: float) -> float:
228
+ """
229
+ Convert raw score into 0..1 confidence.
230
+
231
+ We use a bounded transform:
232
+ - normalize by possible
233
+ - clamp into [0,1]
234
+ """
235
+ if possible <= 0:
236
+ return 0.0
237
+
238
+ # raw_score can be negative; convert to a 0..1 scale
239
+ # normalized_score around 0 means mixed evidence
240
+ normalized = raw_score / possible # roughly -something .. +1
241
+ conf = (normalized + 1.0) / 2.0 # map [-1, +1] -> [0,1] (approx)
242
+ if conf < 0:
243
+ conf = 0.0
244
+ if conf > 1:
245
+ conf = 1.0
246
+ return float(conf)
247
+
248
+
249
+ def score_species_for_genus(
250
+ target_genus: str,
251
+ parsed_fields: Dict[str, str],
252
+ top_n: int = 5,
253
+ ) -> Dict[str, Any]:
254
+ """
255
+ Main entrypoint.
256
+
257
+ Returns:
258
+ {
259
+ "genus": "...",
260
+ "ranked": [
261
+ {
262
+ "species": "cloacae",
263
+ "full_name": "Enterobacter cloacae",
264
+ "score": 0.87,
265
+ "raw_score": ...,
266
+ "possible": ...,
267
+ "matches": [...],
268
+ "conflicts": [...],
269
+ "marker_hits": [...],
270
+ "marker_conflicts": [...],
271
+ "source_file": "data/rag/knowledge_base/Enterobacter/cloacae.json"
272
+ }, ...
273
+ ]
274
+ }
275
+ """
276
+ docs = _load_species_docs_for_genus(target_genus)
277
+ if not docs:
278
+ return {"genus": target_genus, "ranked": []}
279
+
280
+ ranked: List[Dict[str, Any]] = []
281
+
282
+ for doc in docs:
283
+ genus = _norm_str(doc.get("genus") or target_genus)
284
+ species = _norm_str(doc.get("species"))
285
+ full_name = f"{genus} {species}".strip()
286
+
287
+ expected_fields = doc.get("expected_fields") or {}
288
+ markers = doc.get("species_markers") or []
289
+
290
+ s1, p1, matches, conflicts = _score_expected_fields(expected_fields, parsed_fields)
291
+ s2, p2, marker_hits, marker_conflicts = _score_species_markers(markers, parsed_fields)
292
+
293
+ raw_score = s1 + s2
294
+ possible = p1 + p2
295
+
296
+ conf = _to_confidence(raw_score, possible)
297
+
298
+ ranked.append(
299
+ {
300
+ "species": species or os.path.splitext(os.path.basename(doc.get("_source_path", "")))[0],
301
+ "full_name": full_name,
302
+ "score": conf,
303
+ "raw_score": raw_score,
304
+ "possible": possible,
305
+ "matches": matches,
306
+ "conflicts": conflicts,
307
+ "marker_hits": marker_hits,
308
+ "marker_conflicts": marker_conflicts,
309
+ "source_file": doc.get("_source_path", ""),
310
+ }
311
+ )
312
+
313
+ ranked.sort(key=lambda x: x["score"], reverse=True)
314
+ return {"genus": target_genus, "ranked": ranked[: max(1, int(top_n))]}
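A minimal usage sketch of the scorer above. The import path (scoring/species_scorer) and the Enterobacter values are assumptions for illustration; in the pipeline, parsed_fields comes from the fusion parser.

from scoring.species_scorer import score_species_for_genus  # module path assumed

# Hypothetical parsed fields (normally produced by the fusion parser)
parsed_fields = {
    "Gram Stain": "Negative",
    "Indole": "Negative",
    "Citrate": "Positive",
    "Media Grown On": "MacConkey agar; blood agar",
}

result = score_species_for_genus("Enterobacter", parsed_fields, top_n=3)
for row in result["ranked"]:
    print(row["full_name"], round(row["score"], 2), row["conflicts"])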
scoring/diagnostic_anchors.py ADDED
@@ -0,0 +1,97 @@
1
+ # scoring/diagnostic_anchors.py
2
+ # ============================================================
3
+ # Diagnostic anchor overrides:
4
+ # - If the free-text description clearly contains certain
5
+ # pathognomonic phrases, boost the corresponding genus
6
+ # in the unified ranking.
7
+ # ============================================================
8
+
9
+ from __future__ import annotations
10
+
11
+ from typing import List, Dict, Any
12
+
13
+ # Simple v1 — can expand over time
14
+ DIAGNOSTIC_ANCHORS = {
15
+ "Yersinia": [
16
+ "bull’s-eye",
17
+ "bull's eye",
18
+ "cin agar",
19
+ "pseudoappendicitis",
20
+ "pseudo-appendicitis",
21
+ ],
22
+ "Campylobacter": [
23
+ "hippurate",
24
+ "darting motility",
25
+ ],
26
+ "Vibrio": [
27
+ "tcbs agar",
28
+ "thiosulfate citrate bile salts sucrose",
29
+ "yellow colonies on tcbs",
30
+ "rice-water stool",
31
+ "rice water stool",
32
+ ],
33
+ "Proteus": [
34
+ "swarming motility",
35
+ "swarm across the plate",
36
+ "burnt chocolate odor",
37
+ "burned chocolate odour",
38
+ ],
39
+ "Listeria": [
40
+ "tumbling motility",
41
+ "cold enrichment",
42
+ "grows at 4°c",
43
+ "4°c enrichment",
44
+ ],
45
+ "Clostridioides": [
46
+ "ccfa agar",
47
+ "cycloserine cefoxitin fructose agar",
48
+ "barnyard odor",
49
+ "ground glass colonies",
50
+ ],
51
+ }
52
+
53
+
54
+ def apply_diagnostic_overrides(
55
+ description_text: str,
56
+ unified_ranking: List[Dict[str, Any]],
57
+ ) -> List[Dict[str, Any]]:
58
+ """
59
+ If the input description strongly suggests a particular genus
60
+ (anchor phrases), boost that genus in the unified ranking.
61
+
62
+ Strategy:
63
+ - If any anchor phrase for a genus is present in the text,
64
+ ensure that genus has at least 0.70 combined_score
65
+ (70% overall) *if it already appears*.
66
+ - Then re-sort by combined_score.
67
+
68
+ This is conservative: it won't hallucinate genera that aren't
69
+ already in the top list, but strengthens strong clinical signals.
70
+ """
71
+ if not description_text or not unified_ranking:
72
+ return unified_ranking
73
+
74
+ text_lc = description_text.lower()
75
+
76
+ # Which genera have anchors present?
77
+ boosted_genera = set()
78
+ for genus, phrases in DIAGNOSTIC_ANCHORS.items():
79
+ for p in phrases:
80
+ if p.lower() in text_lc:
81
+ boosted_genera.add(genus)
82
+ break
83
+
84
+ if not boosted_genera:
85
+ return unified_ranking
86
+
87
+ # Apply boost only if genus already present
88
+ for item in unified_ranking:
89
+ g = item.get("genus")
90
+ if g in boosted_genera:
91
+ score = float(item.get("combined_score", 0.0))
92
+ if score < 0.70:
93
+ item["combined_score"] = 0.70
94
+ item["combined_percent"] = 70.0
95
+
96
+ unified_ranking.sort(key=lambda d: d.get("combined_score", 0.0), reverse=True)
97
+ return unified_ranking
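A short sketch of how the override is applied; the ranking rows and description below are invented for illustration.

from scoring.diagnostic_anchors import apply_diagnostic_overrides

# Hypothetical unified ranking (already combined Tri-Fusion + ML scores)
ranking = [
    {"genus": "Escherichia", "combined_score": 0.62, "combined_percent": 62.0},
    {"genus": "Proteus", "combined_score": 0.55, "combined_percent": 55.0},
]
text = "Non-lactose fermenter with swarming motility across the blood agar plate."

boosted = apply_diagnostic_overrides(text, ranking)
# "swarming motility" is a Proteus anchor, so Proteus is raised to 0.70 and re-sorted first
print([(r["genus"], r["combined_score"]) for r in boosted])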
scoring/overall_ranker.py ADDED
@@ -0,0 +1,146 @@
1
+ # scoring/overall_ranker.py
2
+ # ============================================================
3
+ # Overall Ranker — Probability Normalisation Layer
4
+ #
5
+ # PURPOSE:
6
+ # - Take already-computed combined scores (Tri-Fusion + ML)
7
+ # - Normalize top-K into human-interpretable probabilities
8
+ # - Provide odds per 1000 for UI display
9
+ #
10
+ # IMPORTANT:
11
+ # - This module DOES NOT assign confidence labels
12
+ # - Confidence logic lives in app.py (decision-band contract)
13
+ #
14
+ # OUTPUT CONTRACT (STRICT):
15
+ # {
16
+ # "overall": [
17
+ # {
18
+ # "rank": int,
19
+ # "genus": str,
20
+ # "combined_score": float,
21
+ # "normalized_share": float, # 0–1, sums to 1.0
22
+ # },
23
+ # ...
24
+ # ],
25
+ # "probabilities_1000": [
26
+ # {
27
+ # "genus": str,
28
+ # "odds_1000": int
29
+ # },
30
+ # ...
31
+ # ]
32
+ # }
33
+ # ============================================================
34
+
35
+ from typing import Dict, List, Any
36
+
37
+
38
+ def compute_overall_scores(
39
+ ml_scores: List[Dict[str, Any]],
40
+ tri_scores: Dict[str, float],
41
+ top_k: int = 5,
42
+ ) -> Dict[str, Any]:
43
+ """
44
+ Normalize already-computed combined scores into
45
+ probability shares and odds for the Top-5 decision table.
46
+
47
+ Parameters
48
+ ----------
49
+ ml_scores : list of dict
50
+ Each dict contains at least:
51
+ { "genus": str, "probability": float }
52
+ (Used ONLY to determine candidate genera)
53
+
54
+ tri_scores : dict
55
+ Dict mapping genus -> combined_score (0–1)
56
+ NOTE: This is already unified (Tri-Fusion + ML).
57
+
58
+ top_k : int
59
+ Number of top genera to return.
60
+
61
+ Returns
62
+ -------
63
+ dict
64
+ {
65
+ "overall": [
66
+ {
67
+ "rank": int,
68
+ "genus": str,
69
+ "combined_score": float,
70
+ "normalized_share": float
71
+ }
72
+ ],
73
+ "probabilities_1000": [
74
+ { "genus": str, "odds_1000": int }
75
+ ]
76
+ }
77
+ """
78
+
79
+ # --------------------------------------------------------
80
+ # 1. Build candidate list
81
+ # --------------------------------------------------------
82
+ combined_rows: List[Dict[str, Any]] = []
83
+
84
+ for genus, score in tri_scores.items():
85
+ try:
86
+ cs = float(score)
87
+ except Exception:
88
+ cs = 0.0
89
+
90
+ if cs > 0:
91
+ combined_rows.append({
92
+ "genus": genus,
93
+ "combined_score": cs
94
+ })
95
+
96
+ if not combined_rows:
97
+ return {
98
+ "overall": [],
99
+ "probabilities_1000": [],
100
+ }
101
+
102
+ # --------------------------------------------------------
103
+ # 2. Sort and trim to top_k
104
+ # --------------------------------------------------------
105
+ combined_rows.sort(
106
+ key=lambda x: x["combined_score"],
107
+ reverse=True
108
+ )
109
+
110
+ top = combined_rows[:top_k]
111
+
112
+ # --------------------------------------------------------
113
+ # 3. Normalize to probability shares (sum = 1.0)
114
+ # --------------------------------------------------------
115
+ total_score = sum(x["combined_score"] for x in top)
116
+
117
+ if total_score <= 0:
118
+ total_score = 1.0 # safety fallback
119
+
120
+ overall: List[Dict[str, Any]] = []
121
+ probabilities_1000: List[Dict[str, Any]] = []
122
+
123
+ for idx, row in enumerate(top, start=1):
124
+ share = row["combined_score"] / total_score
125
+
126
+ # Clamp defensively
127
+ share = max(0.0, min(1.0, share))
128
+
129
+ odds_1000 = int(round(share * 1000))
130
+
131
+ overall.append({
132
+ "rank": idx,
133
+ "genus": row["genus"],
134
+ "combined_score": round(row["combined_score"], 6),
135
+ "normalized_share": round(share, 6),
136
+ })
137
+
138
+ probabilities_1000.append({
139
+ "genus": row["genus"],
140
+ "odds_1000": odds_1000,
141
+ })
142
+
143
+ return {
144
+ "overall": overall,
145
+ "probabilities_1000": probabilities_1000,
146
+ }
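A quick sketch of the output contract in practice. The scores are invented, and ml_scores is passed empty because the current implementation ranks purely from tri_scores.

from scoring.overall_ranker import compute_overall_scores

# Hypothetical combined scores (already unified Tri-Fusion + ML)
tri = {"Enterobacter": 0.72, "Klebsiella": 0.48, "Serratia": 0.20}

out = compute_overall_scores(ml_scores=[], tri_scores=tri, top_k=3)
for row in out["overall"]:
    print(row["rank"], row["genus"], row["normalized_share"])
print(out["probabilities_1000"])  # same shares expressed as integer odds per 1000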
static/eph.jpeg ADDED

Git LFS Details

  • SHA256: 0852f987a45e317f52bfacd47f93df1fb9d2cbcb626be12def47672f400c45f3
  • Pointer size: 131 Bytes
  • Size of remote file: 116 kB
training/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ # Marks the 'training' directory as a Python package
2
+
training/alias_trainer.py ADDED
@@ -0,0 +1,126 @@
1
+ # training/alias_trainer.py
2
+ # ------------------------------------------------------------
3
+ # Stage 10B - Alias Trainer
4
+ #
5
+ # Learns field/value synonyms from gold tests by comparing:
6
+ # - expected values (gold standard)
7
+ # - parsed values (rules + extended)
8
+ #
9
+ # Outputs:
10
+ # - Updated alias_maps.json
11
+ #
12
+ # This is the core intelligence that allows BactAI-D
13
+ # to understand variations in microbiology language.
14
+ # ------------------------------------------------------------
15
+
16
+ import json
17
+ import os
18
+ from collections import defaultdict
19
+
20
+ from engine.parser_rules import parse_text_rules
21
+ from engine.parser_ext import parse_text_extended
22
+
23
+
24
+ GOLD_PATH = "training/gold_tests.json"
25
+ ALIAS_PATH = "data/alias_maps.json"
26
+
27
+
28
+ def normalise(s):
29
+ if s is None:
30
+ return ""
31
+ return str(s).strip().lower()
32
+
33
+
34
+ def learn_aliases():
35
+ """
36
+ Learns synonym mappings from gold tests.
37
+ """
38
+ if not os.path.exists(GOLD_PATH):
39
+ return {"error": f"Gold tests missing: {GOLD_PATH}"}
40
+
41
+ with open(GOLD_PATH, "r", encoding="utf-8") as f:
42
+ gold = json.load(f)
43
+
44
+ # Load or create alias map
45
+ if os.path.exists(ALIAS_PATH):
46
+ with open(ALIAS_PATH, "r", encoding="utf-8") as f:
47
+ alias_maps = json.load(f)
48
+ else:
49
+ alias_maps = {}
50
+
51
+ # Track suggestions
52
+ suggestions = defaultdict(lambda: defaultdict(int))
53
+
54
+ # ------------------------------------------------------------
55
+ # Compare expected vs parsed for all tests
56
+ # ------------------------------------------------------------
57
+ for test in gold:
58
+ text = test.get("input", "")
59
+ expected = test.get("expected", {})
60
+
61
+ rules = parse_text_rules(text).get("parsed_fields", {})
62
+ ext = parse_text_extended(text).get("parsed_fields", {})
63
+
64
+ # merge deterministic parsers
65
+ merged = dict(rules)
66
+ for k, v in ext.items():
67
+ if v != "Unknown":
68
+ merged[k] = v
69
+
70
+ # now compare with expected
71
+ for field, exp_val in expected.items():
72
+ exp_norm = normalise(exp_val)
73
+ got_norm = normalise(merged.get(field, "Unknown"))
74
+
75
+ # Skip correct matches
76
+ if exp_norm == got_norm:
77
+ continue
78
+
79
+ # Skip unknown expected
80
+ if exp_norm in ["", "unknown"]:
81
+ continue
82
+
83
+ # Mismatched → candidate alias
84
+ if got_norm not in ["", "unknown"]:
85
+ suggestions[field][got_norm] += 1
86
+
87
+ # ------------------------------------------------------------
88
+ # Convert suggestions into alias mappings
89
+ # ------------------------------------------------------------
90
+ alias_updates = {}
91
+
92
+ for field, values in suggestions.items():
93
+ # ignore fields with tiny evidence
94
+ for wrong_value, count in values.items():
95
+ if count < 2:
96
+ continue # avoid noise
97
+
98
+ # add/update alias
99
+ if field not in alias_maps:
100
+ alias_maps[field] = {}
101
+
102
+ # map wrong_value → expected canonical version
103
+ # canonical version is the most common value in gold_tests for that field
104
+ canonical = None
105
+ # determine canonical
106
+ field_values = [normalise(t["expected"][field]) for t in gold if field in t.get("expected", {})]
107
+ if field_values:
108
+ # most common expected value
109
+ canonical = max(set(field_values), key=field_values.count)
110
+
111
+ if canonical:
112
+ alias_maps[field][wrong_value] = canonical
113
+ alias_updates[f"{field}:{wrong_value}"] = canonical
114
+
115
+ # ------------------------------------------------------------
116
+ # Save alias maps
117
+ # ------------------------------------------------------------
118
+ with open(ALIAS_PATH, "w", encoding="utf-8") as f:
119
+ json.dump(alias_maps, f, indent=2)
120
+
121
+ return {
122
+ "ok": True,
123
+ "updated_aliases": alias_updates,
124
+ "total_updates": len(alias_updates),
125
+ "alias_map_path": ALIAS_PATH,
126
+ }
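A sketch of running the alias trainer; it expects training/gold_tests.json on disk, and the alias entry in the comment is purely hypothetical.

from training.alias_trainer import learn_aliases

report = learn_aliases()
if report.get("ok"):
    print(report["total_updates"], "aliases written to", report["alias_map_path"])
    # data/alias_maps.json then holds per-field mappings, e.g. (hypothetical):
    # { "Oxidase": { "ox neg": "negative" } }
else:
    print(report.get("error"))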
training/field_weight_trainer.py ADDED
@@ -0,0 +1,330 @@
1
+ # training/field_weight_trainer.py
2
+ # ------------------------------------------------------------
3
+ # Stage 12A — Train Per-Field Parser Weights from Gold Tests
4
+ #
5
+ # Produces:
6
+ # data/field_weights.json
7
+ #
8
+ # This script computes reliability scores for:
9
+ # - parser_rules
10
+ # - parser_ext
11
+ # - parser_llm
12
+ #
13
+ # and outputs:
14
+ # {
15
+ # "global": { ... },
16
+ # "fields": { field -> weights },
17
+ # "meta": { ... }
18
+ # }
19
+ #
20
+ # These weights are used by parser_fusion (Stage 12B).
21
+ # ------------------------------------------------------------
22
+
23
+ from __future__ import annotations
24
+
25
+ import argparse
26
+ import json
27
+ import os
28
+ from collections import defaultdict
29
+ from dataclasses import dataclass
30
+ from typing import Any, Dict, List, Optional, Tuple
31
+
32
+ # Core parsers
33
+ from engine.parser_rules import parse_text_rules
34
+ from engine.parser_ext import parse_text_extended
35
+
36
+ # LLM parser (optional)
37
+ try:
38
+ from engine.parser_llm import parse_llm as parse_text_llm_local
39
+ except Exception:
40
+ parse_text_llm_local = None # gracefully degrade if LLM unavailable
41
+
42
+
43
+ # ------------------------------------------------------------
44
+ # Constants
45
+ # ------------------------------------------------------------
46
+
47
+ DEFAULT_GOLD_PATH = os.path.join("data", "gold_tests.json")
48
+ DEFAULT_OUT_PATH = os.path.join("data", "field_weights.json")
49
+
50
+ MISSING_PENALTY = 0.5
51
+ SMOOTHING = 1e-3
52
+
53
+
54
+ # ------------------------------------------------------------
55
+ # Data Structures
56
+ # ------------------------------------------------------------
57
+
58
+ @dataclass
59
+ class ParserOutcome:
60
+ prediction: Optional[str]
61
+ correct: bool
62
+ wrong: bool
63
+ missing: bool
64
+
65
+
66
+ @dataclass
67
+ class FieldStats:
68
+ correct: int = 0
69
+ wrong: int = 0
70
+ missing: int = 0
71
+
72
+ def total(self) -> int:
73
+ return self.correct + self.wrong + self.missing
74
+
75
+ def score(self, missing_penalty: float = MISSING_PENALTY) -> float:
76
+ if self.total() == 0:
77
+ return 0.0
78
+ denom = self.correct + self.wrong + missing_penalty * self.missing
79
+ if denom == 0:
80
+ return 0.0
81
+ return self.correct / denom
82
+
83
+
84
+ # ------------------------------------------------------------
85
+ # Gold Loading
86
+ # ------------------------------------------------------------
87
+
88
+ def _load_gold_tests(path: str) -> List[Dict[str, Any]]:
89
+ if not os.path.exists(path):
90
+ raise FileNotFoundError(f"Gold tests not found: {path}")
91
+ with open(path, "r", encoding="utf-8") as f:
92
+ data = json.load(f)
93
+ if not isinstance(data, list):
94
+ raise ValueError("gold_tests.json must be a list")
95
+ return data
96
+
97
+
98
+ def _extract_text_and_expected(test_obj: Dict[str, Any]) -> Tuple[str, Dict[str, str]]:
99
+ text = (
100
+ test_obj.get("text")
101
+ or test_obj.get("description")
102
+ or test_obj.get("input")
103
+ or test_obj.get("raw")
104
+ or ""
105
+ )
106
+ if not isinstance(text, str):
107
+ text = str(text)
108
+
109
+ expected: Dict[str, str] = {}
110
+
111
+ if isinstance(test_obj.get("expected"), dict):
112
+ for k, v in test_obj["expected"].items():
113
+ expected[str(k)] = str(v)
114
+ return text, expected
115
+
116
+ if isinstance(test_obj.get("expected_core"), dict):
117
+ for k, v in test_obj["expected_core"].items():
118
+ expected[str(k)] = str(v)
119
+
120
+ if isinstance(test_obj.get("expected_extended"), dict):
121
+ for k, v in test_obj["expected_extended"].items():
122
+ expected[str(k)] = str(v)
123
+
124
+ return text, expected
125
+
126
+
127
+ # ------------------------------------------------------------
128
+ # Parser Execution
129
+ # ------------------------------------------------------------
130
+
131
+ def _get_parser_predictions(text: str, include_llm: bool = True) -> Dict[str, Dict[str, str]]:
132
+ results: Dict[str, Dict[str, str]] = {}
133
+
134
+ r = parse_text_rules(text)
135
+ results["rules"] = dict(r.get("parsed_fields", {}))
136
+
137
+ e = parse_text_extended(text)
138
+ results["extended"] = dict(e.get("parsed_fields", {}))
139
+
140
+ llm_values: Dict[str, str] = {}
141
+ if include_llm and parse_text_llm_local is not None:
142
+ try:
143
+ llm_out = parse_text_llm_local(text)
144
+ llm_values = dict(llm_out.get("parsed_fields", {}))
145
+ except Exception:
146
+ llm_values = {}
147
+ results["llm"] = llm_values
148
+
149
+ return results
150
+
151
+
152
+ def _outcome_for_field(expected_val: str, predicted_val: Optional[str]) -> ParserOutcome:
153
+ if predicted_val is None:
154
+ return ParserOutcome(prediction=None, correct=False, wrong=False, missing=True)
155
+ if predicted_val == expected_val:
156
+ return ParserOutcome(prediction=predicted_val, correct=True, wrong=False, missing=False)
157
+ return ParserOutcome(prediction=predicted_val, correct=False, wrong=True, missing=False)
158
+
159
+
160
+ # ------------------------------------------------------------
161
+ # Stats Computation
162
+ # ------------------------------------------------------------
163
+
164
+ def _compute_stats_from_gold(
165
+ gold_tests: List[Dict[str, Any]],
166
+ include_llm: bool = True,
167
+ ):
168
+ field_stats = defaultdict(lambda: defaultdict(FieldStats))
169
+ global_stats = defaultdict(FieldStats)
170
+
171
+ total_samples = 0
172
+
173
+ for sample in gold_tests:
174
+ text, expected = _extract_text_and_expected(sample)
175
+ if not expected:
176
+ continue
177
+
178
+ total_samples += 1
179
+ preds = _get_parser_predictions(text, include_llm=include_llm)
180
+
181
+ for field, expected_val in expected.items():
182
+ expected_val = str(expected_val)
183
+ for parser_name in ["rules", "extended", "llm"]:
184
+ if parser_name == "llm" and not include_llm:
185
+ continue
186
+
187
+ pred_val = preds.get(parser_name, {}).get(field)
188
+
189
+ outcome = _outcome_for_field(expected_val, pred_val)
190
+
191
+ fs = field_stats[field][parser_name]
192
+ if outcome.correct:
193
+ fs.correct += 1
194
+ if outcome.wrong:
195
+ fs.wrong += 1
196
+ if outcome.missing:
197
+ fs.missing += 1
198
+
199
+ gs = global_stats[parser_name]
200
+ if outcome.correct:
201
+ gs.correct += 1
202
+ if outcome.wrong:
203
+ gs.wrong += 1
204
+ if outcome.missing:
205
+ gs.missing += 1
206
+
207
+ return field_stats, global_stats, total_samples
208
+
209
+
210
+ def _normalise(weights: Dict[str, float], smoothing: float = SMOOTHING) -> Dict[str, float]:
211
+ adjusted = {k: max(smoothing, v) for k, v in weights.items()}
212
+ total = sum(adjusted.values())
213
+ if total <= 0:
214
+ n = len(adjusted)
215
+ return {k: 1.0 / n for k in adjusted}
216
+ return {k: v / total for k, v in adjusted.items()}
217
+
218
+
219
+ def _build_weights_json(
220
+ field_stats,
221
+ global_stats,
222
+ total_samples,
223
+ include_llm=True,
224
+ ):
225
+ # Global scores
226
+ raw_global = {}
227
+ for parser_name, stats in global_stats.items():
228
+ if parser_name == "llm" and not include_llm:
229
+ continue
230
+ raw_global[parser_name] = stats.score(MISSING_PENALTY)
231
+
232
+ global_weights = _normalise(raw_global)
233
+
234
+ # Per-field
235
+ fields_block = {}
236
+
237
+ for field_name, stats_dict in field_stats.items():
238
+ raw_scores = {}
239
+ total_support = 0
240
+
241
+ for parser_name, stats in stats_dict.items():
242
+ if parser_name == "llm" and not include_llm:
243
+ continue
244
+ raw_scores[parser_name] = stats.score(MISSING_PENALTY)
245
+ total_support += stats.total()
246
+
247
+ if total_support < 5:
248
+ # low support → blend global + local
249
+ local_norm = _normalise(raw_scores)
250
+ mixed = {}
251
+ for p in global_weights:
252
+ mixed[p] = 0.7 * global_weights[p] + 0.3 * local_norm.get(p, global_weights[p])
253
+ field_w = _normalise(mixed)
254
+ else:
255
+ field_w = _normalise(raw_scores)
256
+
257
+ fields_block[field_name] = {
258
+ **field_w,
259
+ "support": total_support,
260
+ }
261
+
262
+ return {
263
+ "global": global_weights,
264
+ "fields": fields_block,
265
+ "meta": {
266
+ "total_samples": total_samples,
267
+ "missing_penalty": MISSING_PENALTY,
268
+ "smoothing": SMOOTHING,
269
+ "include_llm": include_llm,
270
+ },
271
+ }
272
+
273
+
274
+ # ------------------------------------------------------------
275
+ # Public API
276
+ # ------------------------------------------------------------
277
+
278
+ def train_field_weights(
279
+ gold_path: str = DEFAULT_GOLD_PATH,
280
+ out_path: str = DEFAULT_OUT_PATH,
281
+ include_llm: bool = False,
282
+ ):
283
+ print(f"[12A] Loading gold tests: {gold_path}")
284
+ gold = _load_gold_tests(gold_path)
285
+ print(f"[12A] {len(gold)} gold samples loaded")
286
+
287
+ field_stats, global_stats, total_samples = _compute_stats_from_gold(
288
+ gold, include_llm=include_llm
289
+ )
290
+
291
+ print("[12A] Computing weights...")
292
+ weights = _build_weights_json(
293
+ field_stats, global_stats, total_samples, include_llm=include_llm
294
+ )
295
+
296
+ out_dir = os.path.dirname(out_path)
297
+ if out_dir and not os.path.exists(out_dir):
298
+ os.makedirs(out_dir, exist_ok=True)
299
+
300
+ print(f"[12A] Writing: {out_path}")
301
+ with open(out_path, "w", encoding="utf-8") as f:
302
+ json.dump(weights, f, indent=2, ensure_ascii=False)
303
+
304
+ print("[12A] Done.")
305
+ return weights
306
+
307
+
308
+ # ------------------------------------------------------------
309
+ # CLI
310
+ # ------------------------------------------------------------
311
+
312
+ def _parse_args(argv=None):
313
+ p = argparse.ArgumentParser(description="Stage 12A — Train parser weights")
314
+ p.add_argument("--gold", type=str, default=DEFAULT_GOLD_PATH)
315
+ p.add_argument("--out", type=str, default=DEFAULT_OUT_PATH)
316
+ p.add_argument("--include-llm", action="store_true")
317
+ return p.parse_args(argv)
318
+
319
+
320
+ def main(argv=None):
321
+ args = _parse_args(argv)
322
+ train_field_weights(
323
+ gold_path=args.gold,
324
+ out_path=args.out,
325
+ include_llm=args.include_llm,
326
+ )
327
+
328
+
329
+ if __name__ == "__main__":
330
+ main()
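A sketch of invoking the trainer directly (equivalent to the CLI). Note that this module defaults to data/gold_tests.json, while the Stage 10 trainers above read training/gold_tests.json, so pass gold_path explicitly if the gold file lives there.

from training.field_weight_trainer import train_field_weights

weights = train_field_weights(
    gold_path="training/gold_tests.json",  # explicit; the module default is data/gold_tests.json
    out_path="data/field_weights.json",
    include_llm=False,
)
print(weights["global"])                 # e.g. {"rules": ..., "extended": ...}
print(weights["meta"]["total_samples"])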
training/gold_tester.py ADDED
@@ -0,0 +1,89 @@
1
+ # training/gold_tester.py
2
+ # ------------------------------------------------------------
3
+ # Stage 10A: Evaluate parsers on gold tests.
4
+ # This MUST NOT crash during import.
5
+ # ------------------------------------------------------------
6
+
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ import os
11
+ from typing import Dict, Any, List
12
+
13
+ from engine.parser_rules import parse_text_rules
14
+ from engine.parser_ext import parse_text_extended
15
+
16
+
17
+ GOLD_PATH = "training/gold_tests.json"
18
+ REPORT_DIR = "reports"
19
+
20
+
21
+ def _load_gold_tests() -> List[Dict[str, Any]]:
22
+ if not os.path.exists(GOLD_PATH):
23
+ return []
24
+ with open(GOLD_PATH, "r", encoding="utf-8") as f:
25
+ try:
26
+ data = json.load(f)
27
+ return data if isinstance(data, list) else []
28
+ except Exception:
29
+ return []
30
+
31
+
32
+ def run_gold_tests(mode: str = "rules") -> Dict[str, Any]:
33
+ gold_tests = _load_gold_tests()
34
+ if not gold_tests:
35
+ return {
36
+ "summary": {
37
+ "mode": mode,
38
+ "tests": 0,
39
+ "total_correct": 0,
40
+ "total_fields": 0,
41
+ "overall_accuracy": 0.0,
42
+ "proposals_path": "data/extended_proposals.jsonl",
43
+ }
44
+ }
45
+
46
+ os.makedirs(REPORT_DIR, exist_ok=True)
47
+
48
+ wrong_cases = []
49
+ total_correct = 0
50
+ total_fields = 0
51
+
52
+ for idx, test in enumerate(gold_tests):
53
+ text = test.get("input", "")
54
+ expected = test.get("expected", {})
55
+
56
+ if mode == "rules":
57
+ parsed = parse_text_rules(text).get("parsed_fields", {})
58
+ elif mode == "rules+extended":
59
+ rule_fields = parse_text_rules(text).get("parsed_fields", {})
60
+ ext_fields = parse_text_extended(text).get("parsed_fields", {})
61
+ parsed = {**rule_fields, **ext_fields}
62
+ else:
63
+ parsed = {}
64
+
65
+ # Compare field-by-field
66
+ correct_count = 0
67
+ for key, val in expected.items():
68
+ total_fields += 1
69
+ if key in parsed and str(parsed[key]).strip().lower() == str(val).strip().lower():
70
+ correct_count += 1
71
+
72
+ total_correct += correct_count
73
+
74
+ if correct_count < len(expected):
75
+ wrong_cases.append(idx)
76
+
77
+ accuracy = total_correct / total_fields if total_fields else 0.0
78
+
79
+ summary = {
80
+ "mode": mode,
81
+ "tests": len(gold_tests),
82
+ "total_correct": total_correct,
83
+ "total_fields": total_fields,
84
+ "overall_accuracy": accuracy,
85
+ "wrong_cases": wrong_cases,
86
+ "proposals_path": "data/extended_proposals.jsonl",
87
+ }
88
+
89
+ return {"summary": summary}
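A sketch of running the evaluator and reading the summary block it returns:

from training.gold_tester import run_gold_tests

report = run_gold_tests(mode="rules+extended")
s = report["summary"]
print(f"{s['total_correct']}/{s['total_fields']} fields correct "
      f"({s['overall_accuracy']:.1%}) across {s['tests']} tests")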
training/gold_tests.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb94485222e9733a0d530d4df8ac0f35c8f95770cc9ef44bcb1289807e0b108e
3
+ size 18563634
training/gold_trainer.py ADDED
@@ -0,0 +1,79 @@
1
+ # training/gold_trainer.py
2
+ # ------------------------------------------------------------
3
+ # Stage 10C — Orchestrates gold-test-driven training:
4
+ # 1) Alias trainer (DISABLED for safety)
5
+ # 2) Schema expander (safe v10C)
6
+ # 3) Signals trainer (placeholder)
7
+ #
8
+ # This file MUST successfully import and expose train_from_gold().
9
+ # ------------------------------------------------------------
10
+
11
+ from __future__ import annotations
12
+ from typing import Dict, Any
13
+
14
+ # Safe schema expander
15
+ from training.schema_expander import expand_schema
16
+
17
+ # Placeholder signals trainer
18
+ from training.signal_trainer import train_signals
19
+
20
+
21
+ def train_from_gold() -> Dict[str, Any]:
22
+ """
23
+ Runs all gold-test–driven training components (Stage 10C).
24
+
25
+ Returns a dict:
26
+ {
27
+ "alias_trainer": {...},
28
+ "schema_expander": {...},
29
+ "signals_trainer": {...}
30
+ }
31
+ """
32
+
33
+ # --------------------------------------------------------
34
+ # 1) Alias Trainer — DISABLED to avoid destructive mappings
35
+ # --------------------------------------------------------
36
+ alias_result = {
37
+ "ok": False,
38
+ "message": (
39
+ "Alias trainer is disabled in Stage 10C to prevent unsafe "
40
+ "auto-mappings. Edit data/alias_maps.json manually if needed."
41
+ ),
42
+ "alias_map_path": "data/alias_maps.json",
43
+ }
44
+
45
+ # --------------------------------------------------------
46
+ # 2) Schema Expander — Safe version
47
+ # --------------------------------------------------------
48
+ try:
49
+ schema_result = expand_schema()
50
+ except Exception as e:
51
+ schema_result = {
52
+ "ok": False,
53
+ "message": f"Schema expander crashed: {e}",
54
+ "auto_added_fields": {},
55
+ "proposed_fields": [],
56
+ "schema_path": "data/extended_schema.json",
57
+ "proposals_path": "data/extended_proposals.jsonl",
58
+ }
59
+
60
+ # --------------------------------------------------------
61
+ # 3) Signals Trainer (placeholder)
62
+ # --------------------------------------------------------
63
+ try:
64
+ signals_result = train_signals()
65
+ except Exception as e:
66
+ signals_result = {
67
+ "ok": False,
68
+ "message": f"Signal trainer crashed: {e}",
69
+ "signals_catalog_path": "data/signals_catalog.json",
70
+ }
71
+
72
+ # --------------------------------------------------------
73
+ # Combined report
74
+ # --------------------------------------------------------
75
+ return {
76
+ "alias_trainer": alias_result,
77
+ "schema_expander": schema_result,
78
+ "signals_trainer": signals_result,
79
+ }
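A sketch of the orchestrator's combined report in use:

from training.gold_trainer import train_from_gold

report = train_from_gold()
print(report["alias_trainer"]["message"])          # explains that the alias trainer is disabled
print("schema_expander ok:", report["schema_expander"].get("ok"))
print("signals_trainer ok:", report["signals_trainer"].get("ok"))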
training/hf_sync.py ADDED
@@ -0,0 +1,68 @@
1
+ # training/hf_sync.py
2
+ # ------------------------------------------------------------
3
+ # Sync updated data files back to the SAME Hugging Face Space.
4
+ # ------------------------------------------------------------
5
+
6
+ import os
7
+ from typing import List, Dict, Any
8
+
9
+ from huggingface_hub import HfApi, CommitOperationAdd
10
+
11
+
12
+ def push_to_hf(
13
+ paths: List[str],
14
+ commit_message: str = "train: update extended schema, aliases, signals from gold tests",
15
+ ) -> Dict[str, Any]:
16
+
17
+ repo_id = os.getenv("HF_SPACE_REPO_ID")
18
+ token = os.getenv("HF_TOKEN")
19
+
20
+ if not repo_id:
21
+ return {
22
+ "ok": False,
23
+ "error": "Missing HF_SPACE_REPO_ID environment variable.",
24
+ "uploaded": [],
25
+ }
26
+
27
+ if not token:
28
+ return {
29
+ "ok": False,
30
+ "error": "Missing HF_TOKEN environment variable.",
31
+ "uploaded": [],
32
+ }
33
+
34
+ api = HfApi()
35
+ operations = []
36
+ uploaded = []
37
+
38
+ for p in paths:
39
+ if not os.path.exists(p):
40
+ continue
41
+
42
+ operations.append(
43
+ CommitOperationAdd(path_in_repo=p, path_or_fileobj=p)
44
+ )
45
+ uploaded.append(p)
46
+
47
+ if not operations:
48
+ return {
49
+ "ok": False,
50
+ "error": "No existing files to upload.",
51
+ "uploaded": [],
52
+ }
53
+
54
+ commit_info = api.create_commit(
55
+ repo_id=repo_id,
56
+ repo_type="space",
57
+ operations=operations,
58
+ commit_message=commit_message,
59
+ token=token,
60
+ )
61
+
62
+ return {
63
+ "ok": True,
64
+ "uploaded": uploaded,
65
+ "repo_id": repo_id,
66
+ "commit_message": commit_message,
67
+ "commit_url": commit_info.commit_url,
68
+ }
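A sketch of pushing trained artifacts back to the Space. It assumes HF_SPACE_REPO_ID and HF_TOKEN are set in the environment; the file list is illustrative.

from training.hf_sync import push_to_hf

result = push_to_hf(
    ["data/alias_maps.json", "data/extended_schema.json", "data/field_weights.json"],
    commit_message="train: sync updated data files",
)
if result["ok"]:
    print("Committed:", result["commit_url"])
else:
    print("Sync failed:", result["error"])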
training/parser_eval.py ADDED
@@ -0,0 +1,104 @@
1
+ # training/parser_eval.py
2
+ # ------------------------------------------------------------
3
+ # Parser Evaluation (Stage 10A)
4
+ #
5
+ # This version ONLY evaluates:
6
+ # - Rule parser
7
+ # - Extended parser
8
+ #
9
+ # The LLM parser is intentionally disabled at this stage
10
+ # because alias maps and schema are not trained yet.
11
+ #
12
+ # This makes Stage 10A FAST and stable (< 3 seconds).
13
+ # ------------------------------------------------------------
14
+
15
+ import json
16
+ import os
17
+ from typing import Dict, Any
18
+
19
+ from engine.parser_rules import parse_text_rules
20
+ from engine.parser_ext import parse_text_extended
21
+
22
+
23
+ # Path to the gold tests
24
+ GOLD_PATH = "training/gold_tests.json"
25
+
26
+
27
+ def evaluate_single_test(test: Dict[str, Any]) -> Dict[str, Any]:
28
+ """
29
+ Evaluate one gold test with rules + extended parsers.
30
+ """
31
+ text = test.get("input", "")
32
+ expected = test.get("expected", {})
33
+
34
+ # Run deterministic parsers
35
+ rule_out = parse_text_rules(text).get("parsed_fields", {})
36
+ ext_out = parse_text_extended(text).get("parsed_fields", {})
37
+
38
+ # Merge rule + extended (extended overwrites rules)
39
+ merged = dict(rule_out)
40
+ for k, v in ext_out.items():
41
+ if v != "Unknown":
42
+ merged[k] = v
43
+
44
+ total = len(expected)
45
+ correct = 0
46
+ wrong = {}
47
+
48
+ for field, exp_val in expected.items():
49
+ got = merged.get(field, "Unknown")
50
+ if str(got).strip().lower() == str(exp_val).strip().lower():
51
+ correct += 0 if str(exp_val).strip().lower() == "unknown" else 1 # Unknown matches are neutral
52
+ else:
53
+ wrong[field] = {"expected": exp_val, "got": got}
54
+
55
+ return {
56
+ "correct": correct,
57
+ "total": total,
58
+ "accuracy": correct / total if total else 0,
59
+ "wrong": wrong,
60
+ "merged": merged,
61
+ }
62
+
63
+
64
+ def run_parser_eval(mode: str = "rules_extended") -> Dict[str, Any]:
65
+ """
66
+ Evaluate ALL gold tests using rules + extended parsing only.
67
+ """
68
+ if not os.path.exists(GOLD_PATH):
69
+ return {"error": f"Gold test file not found at {GOLD_PATH}"}
70
+
71
+ with open(GOLD_PATH, "r", encoding="utf-8") as f:
72
+ gold = json.load(f)
73
+
74
+ results = []
75
+ wrong_cases = []
76
+
77
+ total_correct = 0
78
+ total_fields = 0
79
+
80
+ for test in gold:
81
+ out = evaluate_single_test(test)
82
+ results.append(out)
83
+
84
+ total_correct += out["correct"]
85
+ total_fields += out["total"]
86
+
87
+ if out["wrong"]:
88
+ wrong_cases.append({
89
+ "name": test.get("name", "Unnamed"),
90
+ "wrong": out["wrong"],
91
+ "parsed": out["merged"],
92
+ "expected": test.get("expected", {})
93
+ })
94
+
95
+ summary = {
96
+ "mode": "rules+extended",
97
+ "tests": len(gold),
98
+ "total_correct": total_correct,
99
+ "total_fields": total_fields,
100
+ "overall_accuracy": total_correct / total_fields if total_fields else 0,
101
+ "wrong_cases": wrong_cases,
102
+ }
103
+
104
+ return summary
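A sketch of running the Stage 10A evaluation and inspecting a few failing cases:

from training.parser_eval import run_parser_eval

summary = run_parser_eval()
if "error" in summary:
    print(summary["error"])
else:
    print(f"Accuracy: {summary['overall_accuracy']:.1%} over {summary['tests']} tests")
    for case in summary["wrong_cases"][:3]:
        print(case["name"], "->", case["wrong"])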
training/rag_index_builder.py ADDED
@@ -0,0 +1,629 @@
1
+ # training/rag_index_builder.py
2
+ # ============================================================
3
+ # Build RAG index from JSON knowledge base (SECTION-AWARE)
4
+ #
5
+ # - Walks data/rag/knowledge_base/<Genus>/
6
+ # - Reads genus.json + species JSONs
7
+ # - Converts JSON → structured SECTION records
8
+ # - Computes embeddings via rag.rag_embedder.embed_texts
9
+ # - Writes index to data/rag/index/kb_index.json
10
+ #
11
+ # Output record schema (LOCKED):
12
+ # {
13
+ # "id": "Enterobacter|cloacae|species_markers|0",
14
+ # "level": "genus" | "species",
15
+ # "genus": "Enterobacter",
16
+ # "species": "cloacae" | null,
17
+ # "section": "...",
18
+ # "role": "...",
19
+ # "text": "...",
20
+ # "source_file": "...",
21
+ # "chunk_id": 0,
22
+ # "embedding": [...]
23
+ # }
24
+ #
25
+ # NOTE:
26
+ # We keep the locked keys above. We MAY add extra keys (non-breaking),
27
+ # e.g. "field_key" to support future scoring/weighting.
28
+ # ============================================================
29
+
30
+ from __future__ import annotations
31
+
32
+ import json
33
+ import os
34
+ import re
35
+ from typing import Dict, Any, List, Tuple, Optional
36
+
37
+ from rag.rag_embedder import embed_texts, EMBEDDING_MODEL_NAME
38
+
39
+ KB_ROOT = os.path.join("data", "rag", "knowledge_base")
40
+ INDEX_DIR = os.path.join("data", "rag", "index")
41
+ INDEX_PATH = os.path.join(INDEX_DIR, "kb_index.json")
42
+
43
+ # Chunk size is per-section. This should generally be smaller than the generator
44
+ # prompt chunk budget so retriever can pick "tight" context blocks.
45
+ DEFAULT_MAX_CHARS = int(os.getenv("BACTAI_RAG_CHUNK_MAX_CHARS", "1100"))
46
+
47
+ # ------------------------------------------------------------
48
+ # TEXT HELPERS
49
+ # ------------------------------------------------------------
50
+
51
+ def _norm_str(x: Any) -> str:
52
+ return str(x).strip() if x is not None else ""
53
+
54
+ def _safe_join(items: List[str], sep: str = " ") -> str:
55
+ return sep.join([s for s in items if s])
56
+
57
+ def _bullet_lines(items: List[str], prefix: str = "- ") -> str:
58
+ clean = [i.strip() for i in items if isinstance(i, str) and i.strip()]
59
+ if not clean:
60
+ return ""
61
+ return "\n".join(prefix + c for c in clean)
62
+
63
+ def _title_case_field(field_name: str) -> str:
64
+ # Keep parser field names stable (don’t “prettify” them incorrectly)
65
+ return field_name.strip()
66
+
67
+ def _format_expected_fields(expected_fields: Dict[str, Any]) -> str:
68
+ """
69
+ Turn your expected_fields into a compact, self-contained key:value block.
70
+ Handles strings, lists, and simple scalars.
71
+ """
72
+ if not isinstance(expected_fields, dict) or not expected_fields:
73
+ return ""
74
+
75
+ lines: List[str] = []
76
+ for k in sorted(expected_fields.keys(), key=lambda s: str(s).lower()):
77
+ key = _title_case_field(str(k))
78
+ v = expected_fields.get(k)
79
+
80
+ if isinstance(v, list):
81
+ vals = [str(x).strip() for x in v if str(x).strip()]
82
+ if vals:
83
+ lines.append(f"{key}: " + "; ".join(vals))
84
+ else:
85
+ lines.append(f"{key}: Unknown")
86
+ else:
87
+ val = _norm_str(v) or "Unknown"
88
+ lines.append(f"{key}: {val}")
89
+
90
+ return "\n".join(lines)
91
+
92
+ def _as_list(v: Any) -> List[str]:
93
+ if isinstance(v, list):
94
+ return [str(x).strip() for x in v if str(x).strip()]
95
+ if isinstance(v, str) and v.strip():
96
+ return [v.strip()]
97
+ if v is None:
98
+ return []
99
+ s = str(v).strip()
100
+ return [s] if s else []
101
+
102
+ def _is_unknown(v: str) -> bool:
103
+ return (v or "").strip().lower() in {"unknown", "not specified", "n/a", "na", ""}
104
+
105
+ def _expected_fields_to_sentences(
106
+ expected_fields: Dict[str, Any],
107
+ *,
108
+ subject: str,
109
+ ) -> str:
110
+ """
111
+ Convert expected_fields into DECLARATIVE microbiology statements.
112
+ This is the key fix for "Not specified" RAG outputs:
113
+ LLMs treat these as evidence-like assertions rather than schema metadata.
114
+ """
115
+ if not isinstance(expected_fields, dict) or not expected_fields:
116
+ return ""
117
+
118
+ # Prefer these first (front-load the most diagnostic traits)
119
+ priority = [
120
+ "Gram Stain",
121
+ "Shape",
122
+ "Oxygen Requirement",
123
+ "Motility",
124
+ "Motility Type",
125
+ "Capsule",
126
+ "Spore Formation",
127
+ "Haemolysis",
128
+ "Haemolysis Type",
129
+ "Oxidase",
130
+ "Catalase",
131
+ "Indole",
132
+ "Urease",
133
+ "Citrate",
134
+ "Methyl Red",
135
+ "VP",
136
+ "H2S",
137
+ "ONPG",
138
+ "Nitrate Reduction",
139
+ "NaCl Tolerant (>=6%)",
140
+ "Growth Temperature",
141
+ "Media Grown On",
142
+ "Colony Morphology",
143
+ "Colony Pattern",
144
+ "Pigment",
145
+ "TSI Pattern",
146
+ "Gas Production",
147
+ ]
148
+
149
+ # Then everything else, stable order
150
+ all_keys = list(expected_fields.keys())
151
+ ordered = []
152
+ seen = set()
153
+ for k in priority:
154
+ if k in expected_fields:
155
+ ordered.append(k)
156
+ seen.add(k)
157
+ for k in sorted(all_keys, key=lambda s: str(s).lower()):
158
+ if k not in seen:
159
+ ordered.append(k)
160
+ seen.add(k)
161
+
162
+ lines: List[str] = []
163
+ subj = subject.strip() or "This organism"
164
+
165
+ for k in ordered:
166
+ key = _title_case_field(str(k))
167
+ raw = expected_fields.get(k)
168
+
169
+ if isinstance(raw, list):
170
+ vals = [x for x in _as_list(raw) if not _is_unknown(x)]
171
+ if not vals:
172
+ continue
173
+
174
+ # Special handling for list-like fields
175
+ if key == "Media Grown On":
176
+ lines.append(f"{subj} can grow on: " + ", ".join(vals) + ".")
177
+ elif key == "Colony Morphology":
178
+ lines.append(f"{subj} colonies are described as: " + ", ".join(vals) + ".")
179
+ else:
180
+ lines.append(f"{subj} {key} includes: " + ", ".join(vals) + ".")
181
+ continue
182
+
183
+ val = _norm_str(raw)
184
+ if _is_unknown(val):
185
+ continue
186
+
187
+ # Field-specific phrasing for better “evidence-like” feel
188
+ if key == "Gram Stain":
189
+ lines.append(f"{subj} is typically Gram {val}.")
190
+ elif key == "Shape":
191
+ lines.append(f"{subj} typically has shape: {val}.")
192
+ elif key == "Oxygen Requirement":
193
+ lines.append(f"{subj} is typically {val}.")
194
+ elif key == "Growth Temperature":
195
+ lines.append(f"{subj} typically grows within: {val} °C.")
196
+ elif key == "Haemolysis Type":
197
+ lines.append(f"{subj} haemolysis type is typically: {val}.")
198
+ elif key == "Haemolysis":
199
+ lines.append(f"{subj} haemolysis is typically: {val}.")
200
+ elif key == "Pigment":
201
+ if val.lower() in {"none", "no", "negative"}:
202
+ lines.append(f"{subj} typically produces no pigment.")
203
+ else:
204
+ lines.append(f"{subj} may produce pigment: {val}.")
205
+ elif key == "Colony Pattern":
206
+ lines.append(f"{subj} colony/cellular pattern may be described as: {val}.")
207
+ else:
208
+ # Default: simple assertive sentence
209
+ lines.append(f"{subj} {key} is typically: {val}.")
210
+
211
+ # If we emitted nothing, return empty so we don’t add noise
212
+ return "\n".join(lines).strip()
213
+
214
+ def _format_key_differentiators(items: List[Dict[str, Any]]) -> str:
215
+ """
216
+ For genus-level key_differentiators.
217
+ """
218
+ if not isinstance(items, list) or not items:
219
+ return ""
220
+ out: List[str] = []
221
+ for obj in items:
222
+ if not isinstance(obj, dict):
223
+ continue
224
+ field = _norm_str(obj.get("field"))
225
+ expected = _norm_str(obj.get("expected"))
226
+ notes = _norm_str(obj.get("notes"))
227
+ distinguishes_from = obj.get("distinguishes_from") or []
228
+ if not field:
229
+ continue
230
+
231
+ line = f"{field}: expected {expected or 'Unknown'}."
232
+ if isinstance(distinguishes_from, list) and distinguishes_from:
233
+ line += " Distinguishes from: " + ", ".join([_norm_str(x) for x in distinguishes_from if _norm_str(x)])
234
+ if not line.endswith("."):
235
+ line += "."
236
+ if notes:
237
+ line += f" Notes: {notes}"
238
+ if not line.endswith("."):
239
+ line += "."
240
+ out.append(line)
241
+
242
+ return "\n".join(out)
243
+
244
+ def _format_common_confusions(items: List[Dict[str, Any]], level: str) -> str:
245
+ """
246
+ For genus/species common_confusions.
247
+ """
248
+ if not isinstance(items, list) or not items:
249
+ return ""
250
+ out: List[str] = []
251
+ for obj in items:
252
+ if not isinstance(obj, dict):
253
+ continue
254
+ reason = _norm_str(obj.get("reason"))
255
+ if level == "genus":
256
+ who = _norm_str(obj.get("genus"))
257
+ if who:
258
+ out.append(f"{who}: {reason or 'Reason not specified.'}")
259
+ else:
260
+ who = _norm_str(obj.get("species")) or _norm_str(obj.get("genus"))
261
+ if who:
262
+ out.append(f"{who}: {reason or 'Reason not specified.'}")
263
+ return "\n".join(out)
264
+
265
+ def _format_recommended_next_tests(items: List[Dict[str, Any]]) -> str:
266
+ """
267
+ For recommended_next_tests with optional API kit note.
268
+ """
269
+ if not isinstance(items, list) or not items:
270
+ return ""
271
+ out: List[str] = []
272
+ for obj in items:
273
+ if not isinstance(obj, dict):
274
+ continue
275
+ test = _norm_str(obj.get("test"))
276
+ reason = _norm_str(obj.get("reason"))
277
+ api_kit = _norm_str(obj.get("api_kit"))
278
+
279
+ if not test:
280
+ continue
281
+
282
+ line = f"{test}"
283
+ if api_kit:
284
+ line += f" (API kit: {api_kit})"
285
+ if reason:
286
+ line += f": {reason}"
287
+ out.append(line)
288
+ return "\n".join(out)
289
+
290
+ # ------------------------------------------------------------
291
+ # CHUNKING (SECTION-LOCAL)
292
+ # ------------------------------------------------------------
293
+
294
+ def chunk_text_by_paragraph(text: str, max_chars: int = DEFAULT_MAX_CHARS) -> List[str]:
295
+ """
296
+ Chunk within a single section. We never merge different sections together.
297
+ """
298
+ text = (text or "").strip()
299
+ if not text:
300
+ return []
301
+
302
+ if len(text) <= max_chars:
303
+ return [text]
304
+
305
+ paras = [p.strip() for p in re.split(r"\n\s*\n", text) if p.strip()]
306
+ if not paras:
307
+ paras = [l.strip() for l in text.splitlines() if l.strip()]
308
+
309
+ chunks: List[str] = []
310
+ current = ""
311
+
312
+ for p in paras:
313
+ candidate = (current + "\n\n" + p).strip() if current else p
314
+ if len(candidate) <= max_chars:
315
+ current = candidate
316
+ else:
317
+ if current:
318
+ chunks.append(current)
319
+ if len(p) <= max_chars:
320
+ current = p
321
+ else:
322
+ for i in range(0, len(p), max_chars):
323
+ chunks.append(p[i:i + max_chars].strip())
324
+ current = ""
325
+
326
+ if current:
327
+ chunks.append(current)
328
+
329
+ return [c for c in chunks if c.strip()]
330
+
331
+ # ------------------------------------------------------------
332
+ # SECTION EMITTERS
333
+ # ------------------------------------------------------------
334
+
335
+ def emit_genus_sections(doc: Dict[str, Any], genus: str) -> List[Dict[str, Any]]:
336
+ """
337
+ Convert genus.json to a list of {section, role, text} entries.
338
+ """
339
+ out: List[Dict[str, Any]] = []
340
+
341
+ overview = doc.get("overview") or {}
342
+ if isinstance(overview, dict):
343
+ short = _norm_str(overview.get("short"))
344
+ clinical = _norm_str(overview.get("clinical_context"))
345
+ if short:
346
+ out.append({"section": "overview", "role": "description", "text": f"Genus {genus}: {short}"})
347
+ if clinical:
348
+ out.append({"section": "overview", "role": "description", "text": f"Clinical context: {clinical}"})
349
+
350
+ expected_fields = doc.get("expected_fields")
351
+ if isinstance(expected_fields, dict) and expected_fields:
352
+ # 1) Declarative evidence-like sentences (NEW)
353
+ sent = _expected_fields_to_sentences(expected_fields, subject=f"Genus {genus}")
354
+ if sent:
355
+ out.append({
356
+ "section": "expected_profile_sentences",
357
+ "role": "expected_profile",
358
+ "text": sent,
359
+ })
360
+
361
+ # 2) Keep original key:value block (still useful)
362
+ text = _format_expected_fields(expected_fields)
363
+ if text:
364
+ out.append({
365
+ "section": "expected_fields",
366
+ "role": "expected_profile",
367
+ "text": f"Expected fields for genus {genus}:\n{text}",
368
+ })
369
+
370
+ field_notes = doc.get("field_notes")
371
+ if isinstance(field_notes, dict) and field_notes:
372
+ lines: List[str] = []
373
+ for k in sorted(field_notes.keys(), key=lambda s: str(s).lower()):
374
+ v = _norm_str(field_notes.get(k))
375
+ if v:
376
+ lines.append(f"{_title_case_field(str(k))}: {v}")
377
+ if lines:
378
+ out.append({"section": "field_notes", "role": "clarification", "text": "Field notes:\n" + "\n".join(lines)})
379
+
380
+ kd = doc.get("key_differentiators")
381
+ if isinstance(kd, list) and kd:
382
+ text = _format_key_differentiators(kd)
383
+ if text:
384
+ out.append({"section": "key_differentiators", "role": "differentiation", "text": "Key differentiators:\n" + text})
385
+
386
+ conf = doc.get("common_confusions")
387
+ if isinstance(conf, list) and conf:
388
+ text = _format_common_confusions(conf, level="genus")
389
+ if text:
390
+ out.append({"section": "common_confusions", "role": "warning", "text": "Common confusions:\n" + text})
391
+
392
+ wq = doc.get("when_to_question_identification")
393
+ if isinstance(wq, list) and wq:
394
+ lines = [str(x).strip() for x in wq if str(x).strip()]
395
+ if lines:
396
+ out.append({"section": "when_to_question_identification", "role": "warning", "text": "When to question identification:\n" + _bullet_lines(lines)})
397
+
398
+ rnt = doc.get("recommended_next_tests")
399
+ if isinstance(rnt, list) and rnt:
400
+ text = _format_recommended_next_tests(rnt)
401
+ if text:
402
+ out.append({"section": "recommended_next_tests", "role": "recommendation", "text": "Recommended next tests:\n" + text})
403
+
404
+ ss = doc.get("supported_species")
405
+ if isinstance(ss, list) and ss:
406
+ species_list = [str(x).strip() for x in ss if str(x).strip()]
407
+ if species_list:
408
+ out.append({"section": "supported_species", "role": "metadata", "text": f"Supported species for genus {genus}: " + ", ".join(species_list)})
409
+
410
+ return out
411
+
412
+
413
+ def emit_species_sections(doc: Dict[str, Any], genus: str, species: str) -> List[Dict[str, Any]]:
414
+ """
415
+ Convert a species JSON to a list of {section, role, text} entries.
416
+ """
417
+ out: List[Dict[str, Any]] = []
418
+ overview = doc.get("overview") or {}
419
+ if isinstance(overview, dict):
420
+ short = _norm_str(overview.get("short"))
421
+ clinical = _norm_str(overview.get("clinical_context"))
422
+ if short:
423
+ out.append({"section": "overview", "role": "description", "text": f"Species {genus} {species}: {short}"})
424
+ if clinical:
425
+ out.append({"section": "overview", "role": "description", "text": f"Clinical context: {clinical}"})
426
+
427
+ expected_fields = doc.get("expected_fields")
428
+ if isinstance(expected_fields, dict) and expected_fields:
429
+ # 1) Declarative evidence-like sentences (NEW)
430
+ sent = _expected_fields_to_sentences(expected_fields, subject=f"Species {genus} {species}")
431
+ if sent:
432
+ out.append({
433
+ "section": "expected_profile_sentences",
434
+ "role": "expected_profile",
435
+ "text": sent,
436
+ })
437
+
438
+ # 2) Keep original key:value block
439
+ text = _format_expected_fields(expected_fields)
440
+ if text:
441
+ out.append({"section": "expected_fields", "role": "expected_profile", "text": f"Expected fields for species {genus} {species}:\n{text}"})
442
+
443
+ markers = doc.get("species_markers")
444
+ if isinstance(markers, list) and markers:
445
+ lines: List[str] = []
446
+ for m in markers:
447
+ if not isinstance(m, dict):
448
+ continue
449
+ field = _norm_str(m.get("field"))
450
+ val = _norm_str(m.get("value"))
451
+ importance = _norm_str(m.get("importance"))
452
+ notes = _norm_str(m.get("notes"))
453
+ if not field:
454
+ continue
455
+ line = f"{field}: {val or 'Unknown'}"
456
+ if importance:
457
+ line += f" (importance: {importance})"
458
+ if notes:
459
+ line += f" — {notes}"
460
+ lines.append(line)
461
+ if lines:
462
+ out.append({"section": "species_markers", "role": "species_marker", "text": "Species markers:\n" + "\n".join(lines)})
463
+
464
+ conf = doc.get("common_confusions")
465
+ if isinstance(conf, list) and conf:
466
+ text = _format_common_confusions(conf, level="species")
467
+ if text:
468
+ out.append({"section": "common_confusions", "role": "warning", "text": "Common confusions:\n" + text})
469
+
470
+ wq = doc.get("when_to_question_identification")
471
+ if isinstance(wq, list) and wq:
472
+ lines = [str(x).strip() for x in wq if str(x).strip()]
473
+ if lines:
474
+ out.append({"section": "when_to_question_identification", "role": "warning", "text": "When to question identification:\n" + _bullet_lines(lines)})
475
+
476
+ rnt = doc.get("recommended_next_tests")
477
+ if isinstance(rnt, list) and rnt:
478
+ text = _format_recommended_next_tests(rnt)
479
+ if text:
480
+ out.append({"section": "recommended_next_tests", "role": "recommendation", "text": "Recommended next tests:\n" + text})
481
+
482
+ return out
483
+
484
+
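# Reviewer sketch (not part of the committed file): a minimal, hypothetical
# species document and the kind of entries emit_species_sections() returns.
# All field names and values below are illustrative, not taken from the KB.
#
#   doc = {
#       "overview": {"short": "Example short description.",
#                    "clinical_context": "Example clinical note."},
#       "species_markers": [
#           {"field": "Coagulase", "value": "Positive", "importance": "high"}
#       ],
#   }
#   emit_species_sections(doc, genus="Examplegenus", species="exampli")
#   # -> [{"section": "overview", "role": "description",
#   #      "text": "Species Examplegenus exampli: Example short description."},
#   #     {"section": "overview", "role": "description",
#   #      "text": "Clinical context: Example clinical note."},
#   #     {"section": "species_markers", "role": "species_marker",
#   #      "text": "Species markers:\nCoagulase: Positive (importance: high)"}]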
485
+ # ------------------------------------------------------------
486
+ # INDEX BUILD
487
+ # ------------------------------------------------------------
488
+
489
+ def _iter_kb_files() -> List[Tuple[str, str]]:
490
+ entries: List[Tuple[str, str]] = []
491
+ if not os.path.isdir(KB_ROOT):
492
+ return entries
493
+
494
+ for genus in sorted(os.listdir(KB_ROOT)):
495
+ genus_dir = os.path.join(KB_ROOT, genus)
496
+ if not os.path.isdir(genus_dir):
497
+ continue
498
+ for fname in sorted(os.listdir(genus_dir)):
499
+ if fname.lower().endswith(".json"):
500
+ entries.append((genus, os.path.join(genus_dir, fname)))
501
+ return entries
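# Layout note (illustrative): this walker assumes the KB is organised as one
# directory per genus under KB_ROOT (defined near the top of this module),
# with a genus-level genus.json plus one JSON file per species, e.g.:
#
#   <KB_ROOT>/
#       SomeGenus/
#           genus.json         -> genus-level document
#           some_species.json  -> species-level document
#
# It returns sorted (genus_dir_name, file_path) tuples so index builds are
# deterministic across runs.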
502
+
503
+
504
+ def build_rag_index(max_chars: int = DEFAULT_MAX_CHARS) -> Dict[str, Any]:
505
+ os.makedirs(INDEX_DIR, exist_ok=True)
506
+
507
+ kb_entries = _iter_kb_files()
508
+ if not kb_entries:
509
+ return {"ok": False, "message": "No KB JSON files found."}
510
+
511
+ docs_for_embedding: List[str] = []
512
+ meta: List[Dict[str, Any]] = []
513
+
514
+ num_json_errors = 0
515
+
516
+ for genus_dir_name, path in kb_entries:
517
+ with open(path, "r", encoding="utf-8") as f:
518
+ try:
519
+ doc = json.load(f)
520
+ except json.JSONDecodeError as e:
521
+ print(f"[rag_index_builder] JSON error in {path}: {e}")
522
+ num_json_errors += 1
523
+ continue
524
+
525
+ fname = os.path.basename(path)
526
+ is_genus = fname == "genus.json"
527
+
528
+ genus = _norm_str(doc.get("genus")) or genus_dir_name
529
+ level = "genus" if is_genus else "species"
530
+
531
+ species: Optional[str]
532
+ if is_genus:
533
+ species = None
534
+ sections = emit_genus_sections(doc, genus=genus)
535
+ else:
536
+ species = _norm_str(doc.get("species")) or os.path.splitext(fname)[0]
537
+ sections = emit_species_sections(doc, genus=genus, species=species)
538
+
539
+ for sec in sections:
540
+ section = _norm_str(sec.get("section"))
541
+ role = _norm_str(sec.get("role"))
542
+ text = _norm_str(sec.get("text"))
543
+
544
+ if not section or not role or not text:
545
+ continue
546
+
547
+ chunks = chunk_text_by_paragraph(text, max_chars=max_chars)
548
+ for idx, chunk in enumerate(chunks):
549
+ if not chunk.strip():
550
+ continue
551
+
552
+ rec_id = f"{genus}|{species or 'GENUS'}|{section}|{idx}"
553
+
554
+ docs_for_embedding.append(chunk)
555
+ meta.append(
556
+ {
557
+ "id": rec_id,
558
+ "level": level,
559
+ "genus": genus,
560
+ "species": species,
561
+ "section": section,
562
+ "role": role,
563
+ "text": chunk,
564
+ "source_file": os.path.relpath(path),
565
+ "chunk_id": idx,
566
+ # Optional: helps later for field-level weighting
567
+ "field_key": None,
568
+ }
569
+ )
570
+
571
+ if not docs_for_embedding:
572
+ return {
573
+ "ok": False,
574
+ "message": "No valid sections emitted from KB JSON files. Check schema/contents.",
575
+ "num_files": len(kb_entries),
576
+ "num_json_errors": num_json_errors,
577
+ }
578
+
579
+ embeddings = embed_texts(docs_for_embedding, normalize=True)
580
+
581
+ index_records: List[Dict[str, Any]] = []
582
+ for m, emb in zip(meta, embeddings):
583
+ rec = dict(m)
584
+ rec["embedding"] = emb.tolist()
585
+ index_records.append(rec)
586
+
587
+ with open(INDEX_PATH, "w", encoding="utf-8") as f:
588
+ json.dump(
589
+ {
590
+ "version": 2,
591
+ "model_name": EMBEDDING_MODEL_NAME,
592
+ "record_schema": {
593
+ "id": "str",
594
+ "level": "genus|species",
595
+ "genus": "str",
596
+ "species": "str|null",
597
+ "section": "str",
598
+ "role": "str",
599
+ "text": "str",
600
+ "source_file": "str",
601
+ "chunk_id": "int",
602
+ "embedding": "list[float]",
603
+ },
604
+ "stats": {
605
+ "num_files": len(kb_entries),
606
+ "num_records": len(index_records),
607
+ "num_json_errors": num_json_errors,
608
+ "chunk_max_chars": max_chars,
609
+ },
610
+ "records": index_records,
611
+ },
612
+ f,
613
+ ensure_ascii=False,
614
+ )
615
+
616
+ return {
617
+ "ok": True,
618
+ "message": "RAG index built successfully (section-aware, declarative expected profiles).",
619
+ "index_path": INDEX_PATH,
620
+ "num_records": len(index_records),
621
+ "num_files": len(kb_entries),
622
+ "num_json_errors": num_json_errors,
623
+ "chunk_max_chars": max_chars,
624
+ }
625
+
626
+
627
+ if __name__ == "__main__":
628
+ summary = build_rag_index()
629
+ print(json.dumps(summary, indent=2))
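A minimal retrieval sketch against the index this script writes (not part of the upload). It reuses embed_texts and INDEX_PATH from this module and relies on the embeddings being stored normalized, so a dot product is a cosine similarity. The query string, top_k default, and the in-memory loading strategy are illustrative assumptions.

from typing import Any, Dict, List, Optional
import json

import numpy as np


def search_index(query: str, top_k: int = 5, genus: Optional[str] = None) -> List[Dict[str, Any]]:
    # Load the section-aware index written by build_rag_index().
    with open(INDEX_PATH, "r", encoding="utf-8") as f:
        index = json.load(f)
    records = index["records"]
    if genus:
        # Optional metadata filter before scoring.
        records = [r for r in records if r["genus"] == genus]
    if not records:
        return []
    # Embeddings were stored normalized, so a dot product equals cosine similarity.
    mat = np.asarray([r["embedding"] for r in records], dtype=np.float32)
    q = np.asarray(embed_texts([query], normalize=True)[0], dtype=np.float32)
    scores = mat @ q
    top = np.argsort(-scores)[:top_k]
    return [
        {"score": float(scores[i]),
         **{k: records[i][k] for k in ("genus", "species", "section", "role", "text")}}
        for i in top
    ]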
training/schema_expander.py ADDED
@@ -0,0 +1,237 @@
1
+ # training/schema_expander.py
2
+ # ------------------------------------------------------------
3
+ # Stage 10C — SAFE schema expansion
4
+ #
5
+ # Core fields = EXACT columns in bacteria_db.xlsx.
6
+ # Extended fields = ONLY the ones NOT in DB and NOT in existing schema.
7
+ #
8
+ # This version:
9
+ # - NEVER adds core fields to extended schema.
10
+ # - Only adds true extended fields found in gold tests.
11
+ # - Logs ambiguous or rare fields to proposals file.
12
+ # - Reports field frequencies & values seen for debugging.
13
+ # ------------------------------------------------------------
14
+
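# Reviewer note on input shape: each gold test is a dict whose "expected"
# mapping pairs a field name with the observed value; everything else in the
# record is ignored here. Values below are hypothetical.
#
#   {
#       "expected": {
#           "Aesculin hydrolysis": "Positive",
#           "CAMP": "Negative"
#       }
#   }
#
# Fields that are DB columns or already in the extended schema are skipped;
# the remaining names are counted and either auto-added (frequent and safe)
# or logged to the proposals file for manual review.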
15
+ from __future__ import annotations
16
+
17
+ import os
18
+ import json
19
+ from typing import Dict, Any, List
20
+ from collections import Counter
21
+ from datetime import datetime
22
+
23
+ import pandas as pd
24
+
25
+ from engine.schema import (
26
+ load_extended_schema,
27
+ save_extended_schema,
28
+ )
29
+
30
+ # ------------------------------------------------------------
31
+ # Paths
32
+ # ------------------------------------------------------------
33
+
34
+ GOLD_PATH = "training/gold_tests.json"
35
+ EXTENDED_SCHEMA_PATH = "data/extended_schema.json"
36
+ PROPOSALS_PATH = "data/extended_proposals.jsonl"
37
+
38
+ # Minimum frequency before auto-adding a new extended field
39
+ MIN_FIELD_FREQ = 5
40
+
41
+
42
+ # ------------------------------------------------------------
43
+ # Helper: load gold tests
44
+ # ------------------------------------------------------------
45
+
46
+ def _load_gold_tests() -> List[Dict[str, Any]]:
47
+ if not os.path.exists(GOLD_PATH):
48
+ return []
49
+ with open(GOLD_PATH, "r", encoding="utf-8") as f:
50
+ try:
51
+ data = json.load(f)
52
+ return data if isinstance(data, list) else []
53
+ except Exception:
54
+ return []
55
+
56
+
57
+ # ------------------------------------------------------------
58
+ # Helper: load DB columns (TRUE core schema)
59
+ # ------------------------------------------------------------
60
+
61
+ def _load_db_columns() -> List[str]:
62
+ candidates = [
63
+ os.path.join("data", "bacteria_db.xlsx"),
64
+ "bacteria_db.xlsx",
65
+ ]
66
+ for p in candidates:
67
+ if os.path.exists(p):
68
+ try:
69
+ df = pd.read_excel(p)
70
+ return [str(c).strip() for c in df.columns]
71
+ except Exception:
72
+ continue
73
+ return []
74
+
75
+
76
+ # ------------------------------------------------------------
77
+ # Decide if field name is safe for auto-adding
78
+ # ------------------------------------------------------------
79
+
80
+ def _is_safe_field_name(name: str) -> bool:
81
+ n = name.strip()
82
+ if not n:
83
+ return False
84
+
85
+ low = n.lower()
86
+
87
+ # Ignore extremely short or generic names
88
+ if len(n) < 4:
89
+ return False
90
+ if low in {"test", "growth", "acid", "base", "value", "result"}:
91
+ return False
92
+
93
+ # Clear biochemical patterns
94
+ patterns = [
95
+ "hydrolysis",
96
+ "fermentation",
97
+ "decarboxylase",
98
+ "dihydrolase",
99
+ "reduction",
100
+ "utilization",
101
+ "tolerance",
102
+ "solubility",
103
+ "oxidation",
104
+ "lysis",
105
+ "susceptibility",
106
+ "resistance",
107
+ "pyruvate",
108
+ "lecithinase",
109
+ "lipase",
110
+ "casein",
111
+ "hippurate",
112
+ "tyrosine",
113
+ ]
114
+ if any(pat in low for pat in patterns):
115
+ return True
116
+
117
+ # Known short disc tests
118
+ known_short = {"CAMP", "PYR", "Optochin", "Bacitracin", "Novobiocin"}
119
+ if n in known_short:
120
+ return True
121
+
122
+ # If contains "test" and more than one word → likely legitimate
123
+ if "test" in low and " " in low:
124
+ return True
125
+
126
+ return False
127
+
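# Worked examples for the heuristic above (hand-traced, illustrative only):
#   _is_safe_field_name("Aesculin hydrolysis") -> True   (matches "hydrolysis")
#   _is_safe_field_name("Urease test")         -> True   (contains "test" plus a space)
#   _is_safe_field_name("CAMP")                -> True   (known short disc test)
#   _is_safe_field_name("acid")                -> False  (too generic)
#   _is_safe_field_name("pH")                  -> False  (shorter than 4 characters)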
128
+
129
+ # ------------------------------------------------------------
130
+ # Log proposal (rare/ambiguous fields)
131
+ # ------------------------------------------------------------
132
+
133
+ def _append_proposal(record: Dict[str, Any]) -> None:
134
+ os.makedirs(os.path.dirname(PROPOSALS_PATH), exist_ok=True)
135
+ with open(PROPOSALS_PATH, "a", encoding="utf-8") as f:
136
+ f.write(json.dumps(record, ensure_ascii=False) + "\n")
137
+
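# Each appended line is one JSON object, e.g. (values hypothetical):
#   {"timestamp": "2024-01-01T00:00:00Z", "field_name": "Tween 80 hydrolysis",
#    "freq": 2, "values_seen": {"Positive": 2}}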
138
+
139
+ # ------------------------------------------------------------
140
+ # MAIN ENTRY — SAFE SCHEMA EXPANSION
141
+ # ------------------------------------------------------------
142
+
143
+ def expand_schema() -> Dict[str, Any]:
144
+ gold = _load_gold_tests()
145
+ if not gold:
146
+ return {
147
+ "ok": False,
148
+ "message": f"No gold tests found at {GOLD_PATH}",
149
+ "auto_added_fields": {},
150
+ "proposed_fields": [],
151
+ "schema_path": EXTENDED_SCHEMA_PATH,
152
+ "proposals_path": PROPOSALS_PATH,
153
+ "unknown_fields_raw": {},
154
+ "field_frequencies": {},
155
+ }
156
+
157
+ db_columns = set(_load_db_columns()) # TRUE core schema
158
+ extended_schema = load_extended_schema(EXTENDED_SCHEMA_PATH)
159
+ extended_fields = set(extended_schema.keys())
160
+
161
+ # Counter for unknown fields
162
+ field_counts: Counter[str] = Counter()
163
+ field_values: Dict[str, Counter[str]] = {}
164
+
165
+ for test in gold:
166
+ expected = test.get("expected", {})
167
+ if not isinstance(expected, dict):
168
+ continue
169
+
170
+ for field, value in expected.items():
171
+ fname = str(field).strip()
172
+ if not fname:
173
+ continue
174
+
175
+ # Skip core DB fields
176
+ if fname in db_columns:
177
+ continue
178
+
179
+ # Skip already-known extended fields
180
+ if fname in extended_fields:
181
+ continue
182
+
183
+ # Count unknowns
184
+ field_counts[fname] += 1
185
+ if fname not in field_values:
186
+ field_values[fname] = Counter()
187
+ field_values[fname][str(value).strip()] += 1
188
+
189
+ auto_added: Dict[str, Any] = {}
190
+ proposed: List[Dict[str, Any]] = []
191
+
192
+ # Decide which unknown fields to auto-add
193
+ for fname, freq in field_counts.items():
194
+ values_seen = dict(field_values.get(fname, {}))
195
+
196
+ if freq >= MIN_FIELD_FREQ and _is_safe_field_name(fname):
197
+ # Auto-add as extended test
198
+ extended_schema[fname] = {
199
+ "value_type": "enum_PNV",
200
+ "description": "Auto-added from gold tests (Stage 10C)",
201
+ "values": list(values_seen.keys()),
202
+ }
203
+ auto_added[fname] = {
204
+ "freq": freq,
205
+ "values_seen": list(values_seen.keys()),
206
+ }
207
+ else:
208
+ # Log proposal for later review
209
+ proposed.append(
210
+ {
211
+ "field_name": fname,
212
+ "freq": freq,
213
+ "values_seen": values_seen,
214
+ }
215
+ )
216
+ _append_proposal(
217
+ {
218
+ "timestamp": datetime.utcnow().isoformat() + "Z",
219
+ "field_name": fname,
220
+ "freq": freq,
221
+ "values_seen": values_seen,
222
+ }
223
+ )
224
+
225
+ # Save updated schema
226
+ if auto_added:
227
+ save_extended_schema(extended_schema, EXTENDED_SCHEMA_PATH)
228
+
229
+ return {
230
+ "ok": True,
231
+ "auto_added_fields": auto_added,
232
+ "proposed_fields": proposed,
233
+ "schema_path": EXTENDED_SCHEMA_PATH,
234
+ "proposals_path": PROPOSALS_PATH,
235
+ "unknown_fields_raw": {f: dict(cnt) for f, cnt in field_values.items()},
236
+ "field_frequencies": dict(field_counts),
237
+ }
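A small driver sketch (not part of the upload) showing how a pipeline step might invoke this stage and summarise the outcome; the report formatting is illustrative and the keys come from the return dict above.

if __name__ == "__main__":
    result = expand_schema()
    if result["ok"]:
        print(f"Auto-added {len(result['auto_added_fields'])} extended field(s); "
              f"{len(result['proposed_fields'])} proposal(s) logged to {result['proposals_path']}")
    else:
        print(result["message"])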
training/signal_trainer.py ADDED
@@ -0,0 +1,35 @@
1
+ # training/signal_trainer.py
2
+ # ------------------------------------------------------------
3
+ # Stage 10C placeholder:
4
+ # Safely returns a no-op result for signal training.
5
+ # This MUST NOT crash during import.
6
+ # ------------------------------------------------------------
7
+
8
+ from __future__ import annotations
9
+ from typing import Dict, Any
10
+ import json
11
+ import os
12
+
13
+
14
+ SIGNALS_PATH = "data/signals_catalog.json"
15
+
16
+
17
+ def train_signals() -> Dict[str, Any]:
18
+ """
19
+ Placeholder trainer. Does nothing except ensure signals_catalog.json exists.
20
+ Must NEVER crash.
21
+ """
22
+
23
+ # Ensure signals catalog exists
24
+ if not os.path.exists(SIGNALS_PATH):
25
+ try:
26
+ with open(SIGNALS_PATH, "w", encoding="utf-8") as f:
27
+ json.dump({}, f, indent=2, ensure_ascii=False)
28
+ except Exception:
29
+ pass
30
+
31
+ return {
32
+ "ok": True,
33
+ "message": "Signal trainer not implemented yet (Stage 10C placeholder).",
34
+ "signals_catalog_path": SIGNALS_PATH,
35
+ }
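For completeness, a sketch of how the placeholder might be invoked from a training pipeline (the call site is an assumption; the keys are taken from the return value above).

from training.signal_trainer import train_signals

summary = train_signals()
print(summary["message"])               # placeholder notice
print(summary["signals_catalog_path"])  # data/signals_catalog.json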