Spaces:

sagawa
/

ReactionT5

Running

App Files Community

sagawa committed on Aug 22

Commit

64b6831

verified ·

1 Parent(s): fa55e0e

Update app.py

Browse files

Files changed (1) hide show

app.py +84 -62

app.py CHANGED Viewed

@@ -1,6 +1,5 @@
 import gc
 import os
-import sys
 import warnings
 from types import SimpleNamespace
@@ -14,6 +13,7 @@ from generation_utils import (
     decode_output,
     save_multiple_predictions,
 )
 from torch.utils.data import DataLoader
 from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
 from utils import seed_everything
@@ -111,6 +111,8 @@ with st.sidebar:
         model_options = ["sagawa/ReactionT5v2-yield"]  # default as requested
         model_help = "Default model for yield prediction."
         input_max_length_default = 400
     model_name_or_path = st.selectbox(
         "Model",
@@ -118,15 +120,15 @@ with st.sidebar:
         index=0,
         help=model_help,
     )
-    num_beams = st.slider(
-        "Beam size",
-        min_value=1,
-        max_value=10,
-        value=5,
-        step=1,
-        help="Number of beams for beam search.",
-    )
     seed = st.number_input(
         "Random seed",
@@ -187,9 +189,12 @@ def load_tokenizer(model_ref: str):
 @st.cache_resource(show_spinner=True)
-def load_model(model_ref: str, device_str: str):
     resolved = os.path.abspath(model_ref) if os.path.exists(model_ref) else model_ref
-    model = AutoModelForSeq2SeqLM.from_pretrained(resolved)
     model.to(torch.device(device_str))
     model.eval()
     return model
@@ -253,14 +258,22 @@ if run:
     else:
         # Build config object expected by your dataset/utils
         CFG = SimpleNamespace(
-            num_beams=int(num_beams),
-            num_return_sequences=int(num_beams),  # tie to beams by default
             model_name_or_path=model_name_or_path,
             input_column="input",
-            input_max_length=int(input_max_length),
-            output_max_length=int(output_max_length),
-            output_min_length=int(output_min_length),
-            model="t5",
             seed=int(seed),
             batch_size=int(batch_size),
         )
@@ -272,7 +285,7 @@ if run:
             try:
                 tokenizer = load_tokenizer(CFG.model_name_or_path)
                 CFG.tokenizer = tokenizer
-                model = load_model(CFG.model_name_or_path, device.type)
                 status.update(label="Model ready.", state="complete")
             except Exception as e:
                 st.session_state["last_error"] = f"Failed to load model: {e}"
@@ -296,51 +309,60 @@ if run:
             drop_last=False,
         )
-        # Generation loop with progress
-        all_sequences, all_scores = [], []
-        total = len(dataloader)
-        progress = st.progress(0, text="Generating predictions...")
-        info_placeholder = st.empty()
-        for i, inputs in enumerate(dataloader, start=1):
-            inputs = {k: v.to(device) for k, v in inputs.items()}
-            with torch.no_grad():
-                output = model.generate(
-                    **inputs,
-                    min_length=CFG.output_min_length,
-                    max_length=CFG.output_max_length,
-                    num_beams=CFG.num_beams,
-                    num_return_sequences=CFG.num_return_sequences,
-                    return_dict_in_generate=True,
-                    output_scores=True,
-                )
-            sequences, scores = decode_output(output, CFG)
-            all_sequences.extend(sequences)
-            if scores:
-                all_scores.extend(scores)
-            del output
-            if device.type == "cuda":
-                torch.cuda.empty_cache()
-            gc.collect()
-            progress.progress(i / total, text=f"Generating predictions... {i}/{total}")
-            info_placeholder.caption(f"Processed batch {i} of {total}")
-        progress.empty()
-        info_placeholder.empty()
-        # Save predictions
-        try:
-            output_df = save_multiple_predictions(
-                input_df, all_sequences, all_scores, CFG
-            )
             st.session_state["results_df"] = output_df
             st.success("Prediction complete.")
-        except Exception as e:
-            st.session_state["last_error"] = f"Failed to assemble output: {e}"
-            st.error(st.session_state["last_error"])
-            st.stop()
 # ------------------------------
 # Results

 import gc
 import os
 import warnings
 from types import SimpleNamespace
     decode_output,
     save_multiple_predictions,
 )
+from models import ReactionT5Yield2
 from torch.utils.data import DataLoader
 from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
 from utils import seed_everything
         model_options = ["sagawa/ReactionT5v2-yield"]  # default as requested
         model_help = "Default model for yield prediction."
         input_max_length_default = 400
+        from task_yield.train import preprocess_df
+        from task_yield.prediction import inference_fn
     model_name_or_path = st.selectbox(
         "Model",
         index=0,
         help=model_help,
     )
+    if task != "yield prediction":
+        num_beams = st.slider(
+            "Beam size",
+            min_value=1,
+            max_value=10,
+            value=5,
+            step=1,
+            help="Number of beams for beam search.",
+        )
     seed = st.number_input(
         "Random seed",
 @st.cache_resource(show_spinner=True)
+def load_model(model_ref: str, device_str: str, task: str):
     resolved = os.path.abspath(model_ref) if os.path.exists(model_ref) else model_ref
+    if task != "yield prediction":
+        model = AutoModelForSeq2SeqLM.from_pretrained(resolved)
+    else:
+        model = ReactionT5Yield2.from_pretrained(resolved)
     model.to(torch.device(device_str))
     model.eval()
     return model
     else:
         # Build config object expected by your dataset/utils
         CFG = SimpleNamespace(
+            task=task,
+            num_beams=int(num_beams) if task != "yield prediction" else None,
+            num_return_sequences=int(num_beams)
+            if task != "yield prediction"
+            else None,  # tie to beams by default
             model_name_or_path=model_name_or_path,
             input_column="input",
+            input_max_length=int(input_max_length)
+            if task != "yield prediction"
+            else None,
+            output_max_length=int(output_max_length)
+            if task != "yield prediction"
+            else None,
+            output_min_length=int(output_min_length)
+            if task != "yield prediction"
+            else None,
             seed=int(seed),
             batch_size=int(batch_size),
         )
             try:
                 tokenizer = load_tokenizer(CFG.model_name_or_path)
                 CFG.tokenizer = tokenizer
+                model = load_model(CFG.model_name_or_path, device.type, task)
                 status.update(label="Model ready.", state="complete")
             except Exception as e:
                 st.session_state["last_error"] = f"Failed to load model: {e}"
             drop_last=False,
         )
+        if task == "yield prediction":
+            # Use custom inference function for yield prediction
+            prediction = inference_fn(dataloader, model, CFG)
+            output_df = input_df.copy()
+            output_df["prediction"] = prediction
+            output_df["prediction"] = output_df["prediction"].clip(lower=0.0, upper=100.0)
             st.session_state["results_df"] = output_df
             st.success("Prediction complete.")
+        else:
+            # Generation loop with progress
+            all_sequences, all_scores = [], []
+            total = len(dataloader)
+            progress = st.progress(0, text="Generating predictions...")
+            info_placeholder = st.empty()
+            for i, inputs in enumerate(dataloader, start=1):
+                inputs = {k: v.to(device) for k, v in inputs.items()}
+                with torch.no_grad():
+                    output = model.generate(
+                        **inputs,
+                        min_length=CFG.output_min_length,
+                        max_length=CFG.output_max_length,
+                        num_beams=CFG.num_beams,
+                        num_return_sequences=CFG.num_return_sequences,
+                        return_dict_in_generate=True,
+                        output_scores=True,
+                    )
+                sequences, scores = decode_output(output, CFG)
+                all_sequences.extend(sequences)
+                if scores:
+                    all_scores.extend(scores)
+                del output
+                if device.type == "cuda":
+                    torch.cuda.empty_cache()
+                gc.collect()
+                progress.progress(i / total, text=f"Generating predictions... {i}/{total}")
+                info_placeholder.caption(f"Processed batch {i} of {total}")
+            progress.empty()
+            info_placeholder.empty()
+            # Save predictions
+            try:
+                output_df = save_multiple_predictions(
+                    input_df, all_sequences, all_scores, CFG
+                )
+                st.session_state["results_df"] = output_df
+                st.success("Prediction complete.")
+            except Exception as e:
+                st.session_state["last_error"] = f"Failed to assemble output: {e}"
+                st.error(st.session_state["last_error"])
+                st.stop()
 # ------------------------------
 # Results