Spaces:

angelperedo01
/

Safety-Model-Live-Eval

Sleeping

App Files Files Community

angelperedo01 committed 26 days ago

Commit

0bcf758

verified ·

1 Parent(s): 68c0f61

Update app.py

Browse files

Files changed (1) hide show

app.py +61 -38

app.py CHANGED Viewed

@@ -9,16 +9,15 @@ import numpy as np
 # REPLACE THIS WITH YOUR UPLOADED MODEL NAME!
 MODEL_REPO = "angelperedo01/proj2"
 DATASET_NAME = "nvidia/Aegis-AI-Content-Safety-Dataset-2.0"
-MAX_SAMPLES = 200  # Limit samples for the demo so it doesn't take hours
 def get_text_and_label(example):
     """
-    Your custom logic to parse the NVIDIA dataset labels.
     """
     text = example.get('prompt', '')
     label = None
-    # Try 'prompt_label' first
     if 'prompt_label' in example:
         raw_label = example['prompt_label']
         if isinstance(raw_label, str):
@@ -33,13 +32,12 @@ def get_text_and_label(example):
         else:
             label = int(raw_label)
-    # Default to Safe (0) if we really can't find it
     if label is None: label = 0
     return text, label
-def run_live_evaluation(progress=gr.Progress()):
     # 1. Load Model & Data
-    yield "Loading Model from Hub...", "-", "-", []
     try:
         tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO)
@@ -48,81 +46,106 @@ def run_live_evaluation(progress=gr.Progress()):
         model.to(device)
         model.eval()
     except Exception as e:
-        yield f"Error loading model: {str(e)}", "Error", "Error", []
         return
-    # Load Dataset (Streaming or small slice for speed)
-    yield "Loading NVIDIA Dataset...", "-", "-", []
     try:
-        # Try test split, fallback to train
         ds = load_dataset(DATASET_NAME, split="test")
     except:
         ds = load_dataset(DATASET_NAME, split="train")
-    # Shuffle and select subset for the demo
     ds = ds.shuffle(seed=42).select(range(MAX_SAMPLES))
     true_labels = []
     predictions = []
-    logs = [] # To store misclassifications
     # 2. The Evaluation Loop
-    for i, item in enumerate(progress.tqdm(ds, desc="Evaluating...")):
         text, true_label = get_text_and_label(item)
         true_labels.append(true_label)
-        # Tokenize
-        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=256).to(device)
         # Predict
         with torch.no_grad():
             logits = model(**inputs).logits
             pred = torch.argmax(logits, dim=-1).item()
             predictions.append(pred)
-        # Log Errors (If prediction is wrong)
-        if pred != true_label:
-            status = "🔴 MISS"
-            logs.insert(0, [status, text[:80] + "...", "Safe" if true_label==0 else "Unsafe", "Safe" if pred==0 else "Unsafe"])
         else:
-            # Optional: Log successes too if you want, but it clutters the view
-            pass
-        # Update UI every 5 steps
-        if i % 5 == 0 or i == len(ds)-1:
             acc = accuracy_score(true_labels, predictions)
             f1 = f1_score(true_labels, predictions, zero_division=0)
-            status_msg = f"Processed {i+1}/{MAX_SAMPLES}"
-            yield status_msg, f"{acc:.2%}", f"{f1:.2f}", logs[:10] # Show last 10 errors
 # --- UI LAYOUT ---
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown(f"## 🛡️ Live Safety Model Evaluation")
-    gr.Markdown(f"Running `{MODEL_REPO}` on `{DATASET_NAME}` (Live Inference)")
     with gr.Row():
-        start_btn = gr.Button("▶️ Start Live Test", variant="primary")
     with gr.Row():
         with gr.Column():
             status_box = gr.Label(value="Ready", label="Status")
         with gr.Column():
-            acc_box = gr.Label(value="-", label="Current Accuracy")
         with gr.Column():
-            f1_box = gr.Label(value="-", label="Current F1 Score")
-    gr.Markdown("### 🚨 Recent Misclassifications (Live Feed)")
-    log_table = gr.Dataframe(
-        headers=["Status", "Text Snippet", "True Label", "Predicted"],
         datatype=["str", "str", "str", "str"],
-        row_count=10
     )
     start_btn.click(
-        fn=run_live_evaluation,
         inputs=None,
-        outputs=[status_box, acc_box, f1_box, log_table]
     )
 demo.queue().launch()

 # REPLACE THIS WITH YOUR UPLOADED MODEL NAME!
 MODEL_REPO = "angelperedo01/proj2"
 DATASET_NAME = "nvidia/Aegis-AI-Content-Safety-Dataset-2.0"
+MAX_SAMPLES = 300  # Increased slightly since we aren't rendering the table live
 def get_text_and_label(example):
     """
+    Parses the NVIDIA dataset labels.
     """
     text = example.get('prompt', '')
     label = None
     if 'prompt_label' in example:
         raw_label = example['prompt_label']
         if isinstance(raw_label, str):
         else:
             label = int(raw_label)
     if label is None: label = 0
     return text, label
+def run_evaluation(progress=gr.Progress()):
     # 1. Load Model & Data
+    yield "Loading Model...", "-", "-", []
     try:
         tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO)
         model.to(device)
         model.eval()
     except Exception as e:
+        yield f"Error: {str(e)}", "Error", "Error", []
         return
+    yield "Loading Dataset...", "-", "-", []
     try:
         ds = load_dataset(DATASET_NAME, split="test")
     except:
         ds = load_dataset(DATASET_NAME, split="train")
+    # Shuffle and select subset
     ds = ds.shuffle(seed=42).select(range(MAX_SAMPLES))
     true_labels = []
     predictions = []
+    # Store full details to filter later
+    # Structure: [Status, Text, True, Pred]
+    history_correct = []
+    history_incorrect = []
     # 2. The Evaluation Loop
+    # We yield updates less frequently to prevent UI flashing
+    for i, item in enumerate(progress.tqdm(ds, desc="Classifying...")):
         text, true_label = get_text_and_label(item)
         true_labels.append(true_label)
         # Predict
+        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=256).to(device)
         with torch.no_grad():
             logits = model(**inputs).logits
             pred = torch.argmax(logits, dim=-1).item()
             predictions.append(pred)
+        # Store for final report
+        label_map = {0: "Safe", 1: "Unsafe"}
+        entry = [
+            text,
+            label_map[true_label],
+            label_map[pred]
+        ]
+        if pred == true_label:
+            history_correct.append(["✅ Correct"] + entry)
         else:
+            history_incorrect.append(["🔴 WRONG"] + entry)
+        # Update metrics every 10 steps (Reduces flashing)
+        if i % 10 == 0:
             acc = accuracy_score(true_labels, predictions)
             f1 = f1_score(true_labels, predictions, zero_division=0)
+            # Yield empty list for table so it doesn't try to render anything yet
+            yield f"Processed {i+1}/{MAX_SAMPLES}", f"{acc:.2%}", f"{f1:.2f}", []
+    # 3. Final Compilation
+    # Grab last 10 incorrect and last 10 correct
+    final_display_data = []
+    # Add header/separator logic if you want, or just mix them
+    # We prioritize showing errors first
+    if history_incorrect:
+        final_display_data.extend(history_incorrect[-10:]) # Last 10 errors
+    if history_correct:
+        final_display_data.extend(history_correct[-10:])   # Last 10 correct
+    final_acc = accuracy_score(true_labels, predictions)
+    final_f1 = f1_score(true_labels, predictions, zero_division=0)
+    yield "Evaluation Complete!", f"{final_acc:.2%}", f"{final_f1:.2f}", final_display_data
 # --- UI LAYOUT ---
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown(f"## 🛡️ Model Safety Evaluation Dashboard")
+    gr.Markdown(f"Testing `{MODEL_REPO}` on `{DATASET_NAME}`")
     with gr.Row():
+        start_btn = gr.Button("▶️ Run Live Test", variant="primary")
     with gr.Row():
         with gr.Column():
             status_box = gr.Label(value="Ready", label="Status")
         with gr.Column():
+            acc_box = gr.Label(value="-", label="Accuracy")
         with gr.Column():
+            f1_box = gr.Label(value="-", label="F1 Score")
+    gr.Markdown("### 📝 Final Report: Sample of Results")
+    gr.Markdown("*(Showing last 10 Incorrect and last 10 Correct predictions)*")
+    # Defined table but it stays empty until the end
+    result_table = gr.Dataframe(
+        headers=["Result", "Text Snippet", "True Label", "Predicted"],
         datatype=["str", "str", "str", "str"],
+        wrap=True
     )
     start_btn.click(
+        fn=run_evaluation,
         inputs=None,
+        outputs=[status_box, acc_box, f1_box, result_table]
     )
 demo.queue().launch()