Spaces:

ml-jku
/

tox21_rf_classifier

Sleeping

App Files Files Community

antoniaebner committed on Sep 4

Commit

a8d912f

1 Parent(s): 81226cb

add argparsing to train.py; add docstrings; adapt Tox21RFClassifier save and load functions

Browse files

Files changed (4) hide show

data.py +54 -19
model.py +36 -20
predict.py +10 -5
train.py +47 -12

data.py CHANGED Viewed

@@ -7,7 +7,6 @@ SMILES and target names as keys.
 """
 import os
-from typing import List
 import numpy as np
@@ -27,8 +26,21 @@ def preprocess_molecules(
     load_scaler_path: str = "",
     save_ecdf_path: str = "",
     save_scaler_path: str = "",
-) -> list[int]:
-    """preprocess a list of molecules"""
     assert not (
         load_ecdf_path and save_ecdf_path
     ), "Cannot pass 'load_ecdf_path' and 'save_ecdf_path' simultaneously"
@@ -68,12 +80,12 @@ def preprocess_molecules(
             write_pickle(save_ecdf_path, ecdfs)
             print(f"Saved ECDFs under {save_ecdf_path}")
-    # Create quantils
-    rdkit_descr_quantils = create_quantils(rdkit_descrs, ecdfs)
     print("Created quantiles of RDKit descriptors")
     # Concatenate features
-    raw_features = np.concatenate((ecfps, rdkit_descr_quantils), axis=1)
     if scaler is None:
         scaler = StandardScaler()
@@ -90,9 +102,14 @@ def preprocess_molecules(
     return normalized_features, removed_idxs
-def create_cleaned_mol_objects(smiles: List[str]) -> List[Mol]:
-    """
-    This function creates cleaned RDKit mol objects from a list of SMILES.
     """
     sm = Standardizer(canon_taut=True)
@@ -109,9 +126,14 @@ def create_cleaned_mol_objects(smiles: List[str]) -> List[Mol]:
     return mols, removed_idxs
-def create_ecfp_fps(mols: List[Mol]) -> np.ndarray:
-    """
-    This function ECFP fingerprints for a list of molecules.
     """
     ecfps = list()
@@ -127,9 +149,14 @@ def create_ecfp_fps(mols: List[Mol]) -> np.ndarray:
     return np.array(ecfps)
-def create_rdkit_descriptors(mols: List[Mol]) -> np.ndarray:
-    """
-    This function creates RDKit descriptors for a list of molecules.
     """
     rdkit_descriptors = list()
@@ -145,14 +172,22 @@ def create_rdkit_descriptors(mols: List[Mol]) -> np.ndarray:
     return np.array(rdkit_descriptors)
-def create_quantils(raw_features: np.ndarray, ecdfs: list) -> np.ndarray:
-    quantils = np.zeros_like(raw_features)
     for column in range(raw_features.shape[1]):
         raw_values = raw_features[:, column].reshape(-1)
         ecdf = ecdfs[column]
         q = ecdf(raw_values)
-        quantils[:, column] = q
-    return quantils

 """
 import os
 import numpy as np
     load_scaler_path: str = "",
     save_ecdf_path: str = "",
     save_scaler_path: str = "",
+) -> tuple[np.ndarray, list[int]]:
+    """Preprocessing pipeline for a list of molecules.
+    Args:
+        smiles_list (list[str]): list of SMILES
+        load_ecdf_path (str, optional): Path to load ECDFs from. Defaults to "".
+        load_scaler_path (str, optional): Path to load fitted StandardScaler from. Defaults to "".
+        save_ecdf_path (str, optional): Path to save calculated ECDFs. Defaults to "".
+        save_scaler_path (str, optional): Path to save fitted StandardScaler. Defaults to "".
+    Returns:
+        np.ndarray: normalized ECFPs fingerprints and RDKit descriptor quantiles
+        list[int]: list of removed indices of molecules that could not be cleaned
+    """
     assert not (
         load_ecdf_path and save_ecdf_path
     ), "Cannot pass 'load_ecdf_path' and 'save_ecdf_path' simultaneously"
             write_pickle(save_ecdf_path, ecdfs)
             print(f"Saved ECDFs under {save_ecdf_path}")
+    # Create quantiles
+    rdkit_descr_quantiles = create_quantiles(rdkit_descrs, ecdfs)
     print("Created quantiles of RDKit descriptors")
     # Concatenate features
+    raw_features = np.concatenate((ecfps, rdkit_descr_quantiles), axis=1)
     if scaler is None:
         scaler = StandardScaler()
     return normalized_features, removed_idxs
+def create_cleaned_mol_objects(smiles: list[str]) -> list[Mol]:
+    """This function creates cleaned RDKit mol objects from a list of SMILES.
+    Args:
+        smiles (list[str]): list of SMILES
+    Returns:
+        list[Mol]: list of cleaned molecules
     """
     sm = Standardizer(canon_taut=True)
     return mols, removed_idxs
+def create_ecfp_fps(mols: list[Mol]) -> np.ndarray:
+    """This function ECFP fingerprints for a list of molecules.
+    Args:
+        mols (list[Mol]): list of molecules
+    Returns:
+        np.ndarray: ECFP fingerprints of molecules
     """
     ecfps = list()
     return np.array(ecfps)
+def create_rdkit_descriptors(mols: list[Mol]) -> np.ndarray:
+    """This function creates RDKit descriptors for a list of molecules.
+    Args:
+        mols (list[Mol]): list of molecules
+    Returns:
+        np.ndarray: RDKit descriptors of molecules
     """
     rdkit_descriptors = list()
     return np.array(rdkit_descriptors)
def create_quantiles(raw_features: np.ndarray, ecdfs: list) -> np.ndarray:
    """Map every feature column to quantile values using that column's ECDF.

    Args:
        raw_features (np.ndarray): values to put into quantiles; columns are
            addressed by position along axis 1
        ecdfs (list): one callable per column; ``ecdfs[i]`` is applied to
            column ``i``

    Returns:
        np.ndarray: computed quantiles, same shape as ``raw_features``. The
            dtype of a floating input is kept; any other input dtype yields
            ``float64``.
    """
    # zeros_like would otherwise inherit an integer/bool dtype from the input
    # and fractional ECDF outputs would be truncated on assignment.
    if np.issubdtype(raw_features.dtype, np.floating):
        out_dtype = raw_features.dtype
    else:
        out_dtype = np.float64
    quantiles = np.zeros_like(raw_features, dtype=out_dtype)
    for column in range(raw_features.shape[1]):
        raw_values = raw_features[:, column].reshape(-1)
        quantiles[:, column] = ecdfs[column](raw_values)
    return quantiles

model.py CHANGED Viewed

@@ -17,43 +17,59 @@ from utils import TASKS
 # ---------------------------------------------------------------------------------------
 class Tox21RFClassifier:
-    """
-    A random forest classifier that assigns a toxicity score to a given SMILES string.
-    """
     def __init__(self, seed: int = 42):
         self.tasks = TASKS
         self.model = {
             task: RandomForestClassifier(n_estimators=1001, random_state=seed)
             for task in self.tasks
         }
-    def load_model(self, folder: str):
-        """
-        Loads the model from a given model checkpoint
-        """
-        self.model = {
-            task: joblib.load(os.path.join(folder, f"rf_{task}.joblib"))
-            for task in self.tasks
-        }
-    def save_model(self, folder: str):
         """
-        Saves the model to a given folder
         """
-        if not os.path.exists(folder):
-            os.makedirs(folder)
-        for task, model in self.model.items():
-            joblib.dump(model, os.path.join(folder, f"rf_{task}.joblib"))
     def fit(self, task: str, input_features: np.ndarray, labels: np.ndarray) -> None:
         assert task in self.tasks, f"Unknown task: {task}"
         self.model[task].fit(input_features, labels)
-    def predict(self, task: str, features: np.ndarray) -> dict:
-        """
-        Predicts a given Tox21 targets for a given np.array of molecule features
         """
         assert task in self.tasks, f"Unknown task: {task}"
         preds = self.model[task].predict_proba(features)

 # ---------------------------------------------------------------------------------------
 class Tox21RFClassifier:
+    """A random forest classifier that assigns a toxicity score to a given SMILES string."""
     def __init__(self, seed: int = 42):
+        """Initialize a random forest classifier for each of the 12 Tox21 tasks.
+        Args:
+            seed (int, optional): seed for RF to ensure reproducibility. Defaults to 42.
+        """
         self.tasks = TASKS
         self.model = {
             task: RandomForestClassifier(n_estimators=1001, random_state=seed)
             for task in self.tasks
         }
+    def load_model(self, path: str) -> None:
+        """Loads the model from a given path
+        Args:
+            path (str): path to model checkpoint
         """
+        self.model = joblib.load(path)
+    def save_model(self, path: str) -> None:
+        """Saves the model to a given path
+        Args:
+            path (str): path to save model to
         """
+        if not os.path.exists(os.path.pardir(path)):
+            os.makedirs(os.path.pardir(path))
+        joblib.dump(self.model, path)
     def fit(self, task: str, input_features: np.ndarray, labels: np.ndarray) -> None:
+        """Train the random forest for a given task
+        Args:
+            task (str): task to train
+            input_features (np.ndarray): training features
+            labels (np.ndarray): training labels
+        """
         assert task in self.tasks, f"Unknown task: {task}"
         self.model[task].fit(input_features, labels)
+    def predict(self, task: str, features: np.ndarray) -> np.ndarray:
+        """Predicts labels for a given Tox21 target using molecule features
+        Args:
+            task (str): the Tox21 target to predict for
+            features (np.ndarray): molecule features used for prediction
+        Returns:
+            np.ndarray: predicted probability for positive class
         """
         assert task in self.tasks, f"Unknown task: {task}"
         preds = self.model[task].predict_proba(features)

predict.py CHANGED Viewed

@@ -6,7 +6,6 @@ SMILES and target names as keys.
 # ---------------------------------------------------------------------------------------
 # Dependencies
-from typing import List
 from collections import defaultdict
 from data import preprocess_molecules
@@ -15,9 +14,15 @@ from model import Tox21RFClassifier
 # ---------------------------------------------------------------------------------------
-def predict(smiles_list: List[str]) -> dict:
-    """
-    Applies the classifier to a list of SMILES strings.
     """
     # preprocessing pipeline
     features, removed_idxs = preprocess_molecules(
@@ -28,7 +33,7 @@ def predict(smiles_list: List[str]) -> dict:
     # setup model
     model = Tox21RFClassifier(seed=42)
-    model.load_model("assets/model/")
     # make predictions
     predictions = defaultdict(dict)

 # ---------------------------------------------------------------------------------------
 # Dependencies
 from collections import defaultdict
 from data import preprocess_molecules
 # ---------------------------------------------------------------------------------------
+def predict(smiles_list: list[str]) -> dict:
+    """Applies the classifier to a list of SMILES strings. Returns prediction=0.0 for
+    any molecule that could not be cleaned.
+    Args:
+        smiles_list (list[str]): list of SMILES strings
+    Returns:
+        dict: nested prediction dictionary, following {'<smiles>': {'<target>': <pred>}}
     """
     # preprocessing pipeline
     features, removed_idxs = preprocess_molecules(
     # setup model
     model = Tox21RFClassifier(seed=42)
+    model.load_model("assets/rf_alltasks.joblib")
     # make predictions
     predictions = defaultdict(dict)

train.py CHANGED Viewed

@@ -2,6 +2,8 @@
 Script for fitting and saving any preprocessing assets, as well as the fitted RandomForest model
 """
 import numpy as np
 from tabulate import tabulate
@@ -12,9 +14,43 @@ from data import preprocess_molecules
 from model import Tox21RFClassifier
 from utils import HF_TOKEN
-def get_sample_mask(removed_idxs: list[int], labels: np.ndarray):
-    # mask out NaN labels and labels of removed idxs
     task_mask = ~np.isnan(labels)
     removed_mask = np.ones_like(labels, dtype=bool)
     removed_mask[removed_idxs] = 0
@@ -25,25 +61,23 @@ def get_sample_mask(removed_idxs: list[int], labels: np.ndarray):
     return feature_mask, label_mask
-def main():
-    # save preprocessing scaler and ecdf distributions
-    save_folder = "assets/model/"
     ds = load_dataset("tschouis/tox21", token=HF_TOKEN)
     print("Preprocess train molecules")
     train_smiles = list(ds["train"]["smiles"])
     train_features, train_removed_idxs = preprocess_molecules(
         train_smiles,
-        save_ecdf_path="assets/ecdfs.pkl",
-        save_scaler_path="assets/scaler.pkl",
     )
     print("Preprocess validation molecules")
     val_smiles = list(ds["validation"]["smiles"])
     val_features, val_removed_idxs = preprocess_molecules(
         val_smiles,
-        load_ecdf_path="assets/ecdfs.pkl",
-        load_scaler_path="assets/scaler.pkl",
     )
     model = Tox21RFClassifier(seed=42)
@@ -57,8 +91,8 @@ def main():
             task, train_features[feature_mask], task_labels[label_mask].astype(int)
         )
-    print(f"Save model under {save_folder}")
-    # model.save_model(save_folder)
     print("Evaluate model")
     results = {}
@@ -76,4 +110,5 @@ def main():
 if __name__ == "__main__":
-    main()

 Script for fitting and saving any preprocessing assets, as well as the fitted RandomForest model
 """
+import argparse
 import numpy as np
 from tabulate import tabulate
 from model import Tox21RFClassifier
 from utils import HF_TOKEN
# Command-line interface of the training script. All options are optional and
# default to files below "assets/".
parser = argparse.ArgumentParser(description="RF Training script for Tox21 dataset")
parser.add_argument(
    "--save_path_model",
    type=str,
    default="assets/rf_alltasks.joblib",
    help="File the fitted model is saved to (default: %(default)s).",
)
parser.add_argument(
    "--save_path_ecdfs",
    type=str,
    default="assets/ecdfs.pkl",
    help=(
        "File the ECDFs computed on the training set are saved to; the same "
        "file is loaded again for the validation set (default: %(default)s)."
    ),
)
parser.add_argument(
    "--save_path_scaler",
    type=str,
    default="assets/scaler.pkl",
    help=(
        "File the scaler fitted on the training set is saved to; the same "
        "file is loaded again for the validation set (default: %(default)s)."
    ),
)
+def get_sample_mask(
+    removed_idxs: list[int], labels: np.ndarray
+) -> tuple[np.ndarray, np.ndarray]:
+    """Returns two masks, one for the samples and one for the labels.
+    Filters out any indices removed from the samples and any indices
+    where the label is NaN.
+    Args:
+        removed_idxs (list[int]): Indices that were removed from the samples
+        labels (np.ndarray): list of labels
+    Returns:
+        np.ndarray: Feature mask
+        np.ndarray: Label mask
+    """
     task_mask = ~np.isnan(labels)
     removed_mask = np.ones_like(labels, dtype=bool)
     removed_mask[removed_idxs] = 0
     return feature_mask, label_mask
+def main(args):
     ds = load_dataset("tschouis/tox21", token=HF_TOKEN)
     print("Preprocess train molecules")
     train_smiles = list(ds["train"]["smiles"])
     train_features, train_removed_idxs = preprocess_molecules(
         train_smiles,
+        save_ecdf_path=args.save_path_ecdfs,
+        save_scaler_path=args.save_path_scaler,
     )
     print("Preprocess validation molecules")
     val_smiles = list(ds["validation"]["smiles"])
     val_features, val_removed_idxs = preprocess_molecules(
         val_smiles,
+        load_ecdf_path=args.save_path_ecdfs,
+        load_scaler_path=args.save_path_scaler,
     )
     model = Tox21RFClassifier(seed=42)
             task, train_features[feature_mask], task_labels[label_mask].astype(int)
         )
+    print(f"Save model under {args.save_path_model}")
+    model.save_model(args.save_path_model)
     print("Evaluate model")
     results = {}
 if __name__ == "__main__":
+    args = parser.parse_args()
+    main(args)