Spaces:

ml-jku
/

tox21_rf_classifier

Sleeping

App Files Files Community

antoniaebner commited on Sep 3

Commit

81226cb

1 Parent(s): 7f6d1d6

add RF framework

Browse files

Files changed (6) hide show

data.py +158 -0
model.py +60 -0
predict.py +42 -0
requirements.txt +7 -0
train.py +79 -0
utils.py +441 -0

data.py ADDED Viewed

	@@ -0,0 +1,158 @@

+# pipeline taken from https://huggingface.co/spaces/ml-jku/mhnfs/blob/main/src/data_preprocessing/create_descriptors.py
+"""
+This file includes the data processing for Tox21.
+As an input it takes a list of SMILES and it outputs a nested dictionary with
+SMILES and target names as keys.
+"""
+import os
+from typing import List
+import numpy as np
+from sklearn.preprocessing import StandardScaler
+from statsmodels.distributions.empirical_distribution import ECDF
+from rdkit import Chem, DataStructs
+from rdkit.Chem import Descriptors, rdFingerprintGenerator
+from rdkit.Chem.rdchem import Mol
+from utils import USED_200_DESCR, Standardizer, load_pickle, write_pickle
def _load_asset_or_none(path: str, label: str):
    """Return the pickled asset stored at ``path``, or ``None`` if unavailable.

    An empty ``path`` means the asset was not requested. A non-empty path that
    does not exist also yields ``None`` (the caller then fits a fresh asset),
    but a warning is emitted so that this fallback is not silent.
    """
    import warnings

    if not path:
        return None
    if not os.path.exists(path):
        warnings.warn(
            f"{label} file '{path}' not found; {label} will be re-fitted on the "
            "given molecules",
            stacklevel=3,
        )
        return None
    return load_pickle(path)


def preprocess_molecules(
    smiles_list: list[str],
    load_ecdf_path: str = "",
    load_scaler_path: str = "",
    save_ecdf_path: str = "",
    save_scaler_path: str = "",
) -> tuple[np.ndarray, list[int]]:
    """Turn a list of SMILES into a normalized feature matrix.

    Pipeline: clean the molecules, compute ECFP count fingerprints and RDKit
    descriptors, map each descriptor column through its ECDF, concatenate
    fingerprints and descriptor quantiles, and standard-scale the result.

    Args:
        smiles_list: SMILES strings to featurize.
        load_ecdf_path: Pickle with previously fitted ECDFs. If empty or
            missing, ECDFs are fitted on the given molecules.
        load_scaler_path: Pickle with a previously fitted StandardScaler. If
            empty or missing, a scaler is fitted on the given molecules.
        save_ecdf_path: Where to pickle newly fitted ECDFs (optional).
        save_scaler_path: Where to pickle a newly fitted scaler (optional).

    Returns:
        ``(features, removed_idxs)``: ``features`` has one row per molecule
        that survived cleaning (in input order); ``removed_idxs`` lists the
        positions in ``smiles_list`` that were dropped.

    Raises:
        ValueError: If a load and a save path are given for the same asset, or
            if no molecule survives cleaning while an asset has to be fitted.
    """
    # Explicit raises instead of asserts: asserts are stripped under ``-O``.
    if load_ecdf_path and save_ecdf_path:
        raise ValueError(
            "Cannot pass 'load_ecdf_path' and 'save_ecdf_path' simultaneously"
        )
    if load_scaler_path and save_scaler_path:
        raise ValueError(
            "Cannot pass 'load_scaler_path' and 'save_scaler_path' simultaneously"
        )

    ecdfs = _load_asset_or_none(load_ecdf_path, "ECDF")
    scaler = _load_asset_or_none(load_scaler_path, "StandardScaler")

    # Create cleaned rdkit mol objects
    mols, removed_idxs = create_cleaned_mol_objects(smiles_list)
    print("Cleaned molecules")

    if not mols:
        # Nothing to featurize: the descriptor matrix would have no columns
        # and the code below would fail on ``.shape[1]``.
        if ecdfs is None or scaler is None:
            raise ValueError(
                "No valid molecules left after cleaning; cannot fit ECDFs/scaler"
            )
        # NOTE(review): assumes the loaded scaler exposes ``n_features_in_``
        # (fitted scikit-learn estimators do); falls back to 0 columns.
        n_features = getattr(scaler, "n_features_in_", 0)
        return np.empty((0, n_features)), removed_idxs

    # Create fingerprints and descriptors
    ecfps = create_ecfp_fps(mols)
    print("Created ECFP fingerprints")
    rdkit_descrs = create_rdkit_descriptors(mols)
    print("Created RDKit descriptors")

    # Create and save ecdfs (one ECDF per descriptor column)
    if ecdfs is None:
        print("Create ECDFs")
        ecdfs = [
            ECDF(rdkit_descrs[:, column].reshape(-1))
            for column in range(rdkit_descrs.shape[1])
        ]
        if save_ecdf_path:
            write_pickle(save_ecdf_path, ecdfs)
            print(f"Saved ECDFs under {save_ecdf_path}")

    # Create quantiles
    rdkit_descr_quantils = create_quantils(rdkit_descrs, ecdfs)
    print("Created quantiles of RDKit descriptors")

    # Concatenate features
    raw_features = np.concatenate((ecfps, rdkit_descr_quantils), axis=1)

    if scaler is None:
        scaler = StandardScaler()
        scaler.fit(raw_features)
        print("Fitted the StandardScaler")
        if save_scaler_path:
            write_pickle(save_scaler_path, scaler)
            print(f"Saved the StandardScaler under {save_scaler_path}")

    # Normalize feature vectors
    normalized_features = scaler.transform(raw_features)
    print("Normalized the molecule features")
    return normalized_features, removed_idxs
def create_cleaned_mol_objects(smiles: List[str]) -> tuple[List[Mol], List[int]]:
    """Create cleaned RDKit mol objects from a list of SMILES.

    Each SMILES is parsed, standardized (with tautomer canonicalization) and
    round-tripped through its canonical SMILES.

    Args:
        smiles: SMILES strings to clean.

    Returns:
        ``(mols, removed_idxs)``: the cleaned molecules in input order, and the
        positions in ``smiles`` of the entries that could not be cleaned.
    """
    standardizer = Standardizer(canon_taut=True)
    mols: List[Mol] = []
    removed_idxs: List[int] = []
    for idx, smi in enumerate(smiles):
        # An unparsable SMILES gives a non-Mol here, which standardize_mol
        # reports as (None, error_msg) instead of raising.
        standardized_mol, _ = standardizer.standardize_mol(Chem.MolFromSmiles(smi))
        can_mol = (
            None
            if standardized_mol is None
            else Chem.MolFromSmiles(Chem.MolToSmiles(standardized_mol))
        )
        # Also drop molecules whose canonical SMILES fails to re-parse, so that
        # no None entry reaches the fingerprint/descriptor functions.
        if can_mol is None:
            removed_idxs.append(idx)
            continue
        mols.append(can_mol)
    return mols, removed_idxs
def create_ecfp_fps(mols: List[Mol]) -> np.ndarray:
    """
    This function creates ECFP (Morgan) count fingerprints for a list of
    molecules and returns them stacked into one array, one entry per molecule.
    """

    def _count_fp_as_array(mol: Mol) -> np.ndarray:
        # Fingerprint a single molecule and copy it into a dense int8 array.
        sparse_fp = rdFingerprintGenerator.GetCountFPs(
            [mol], fpType=rdFingerprintGenerator.MorganFP
        )[0]
        # Zero-length placeholder that ConvertToNumpyArray fills in place.
        dense_fp = np.zeros((0,), np.int8)
        DataStructs.ConvertToNumpyArray(sparse_fp, dense_fp)
        return dense_fp

    return np.array([_count_fp_as_array(mol) for mol in mols])
def create_rdkit_descriptors(mols: List[Mol]) -> np.ndarray:
    """
    This function creates RDKit descriptors for a list of molecules.

    Every calculator in ``Descriptors._descList`` is evaluated per molecule;
    only the positions listed in ``USED_200_DESCR`` are kept.
    """
    per_molecule = []
    for mol in mols:
        all_values = np.array(
            [calc_fn(mol) for _name, calc_fn in Descriptors._descList]
        )
        per_molecule.append(all_values[USED_200_DESCR])
    return np.array(per_molecule)
def create_quantils(raw_features: np.ndarray, ecdfs: list) -> np.ndarray:
    """Map every feature column through its ECDF.

    Args:
        raw_features: 2-D array, one row per molecule, one column per feature.
        ecdfs: One callable per column; ``ecdfs[c]`` is applied to column ``c``
            and must return one value per row.

    Returns:
        Float array with the same shape as ``raw_features`` holding the values
        returned by the ECDFs.
    """
    # Allocate as float explicitly: ``zeros_like`` would inherit an integer
    # dtype from integer input and truncate the quantiles on assignment.
    quantils = np.zeros(raw_features.shape, dtype=float)
    for column in range(raw_features.shape[1]):
        column_values = raw_features[:, column].reshape(-1)
        quantils[:, column] = ecdfs[column](column_values)
    return quantils

model.py ADDED Viewed

	@@ -0,0 +1,60 @@

+"""
+This file includes an RF model for Tox21.
+As an input it takes a list of SMILES and it outputs a nested dictionary with
+SMILES and target names as keys.
+"""
+# ---------------------------------------------------------------------------------------
+# Dependencies
+import os
+import joblib
+import numpy as np
+from sklearn.ensemble import RandomForestClassifier
+from utils import TASKS
+# ---------------------------------------------------------------------------------------
class Tox21RFClassifier:
    """
    A random forest classifier that assigns a toxicity score to molecule
    feature vectors. One independent RandomForestClassifier is kept per task
    in ``TASKS``.
    """

    def __init__(self, seed: int = 42):
        """Create one unfitted forest per task.

        Args:
            seed: Random state passed to every forest.
        """
        self.tasks = TASKS
        self.model = {
            task: RandomForestClassifier(n_estimators=1001, random_state=seed)
            for task in self.tasks
        }

    def _check_task(self, task: str) -> None:
        """Raise ``ValueError`` if ``task`` is not one of ``self.tasks``."""
        # Explicit raise instead of assert: asserts are stripped under ``-O``.
        if task not in self.tasks:
            raise ValueError(f"Unknown task: {task}")

    def load_model(self, folder: str):
        """
        Loads the per-task forests from ``folder``; expects one file
        ``rf_<task>.joblib`` for every task.
        """
        self.model = {
            task: joblib.load(os.path.join(folder, f"rf_{task}.joblib"))
            for task in self.tasks
        }

    def save_model(self, folder: str):
        """
        Saves every per-task forest to ``folder`` as ``rf_<task>.joblib``,
        creating the folder if needed.
        """
        # exist_ok avoids the check-then-create race of a separate exists().
        os.makedirs(folder, exist_ok=True)
        for task, model in self.model.items():
            joblib.dump(model, os.path.join(folder, f"rf_{task}.joblib"))

    def fit(self, task: str, input_features: np.ndarray, labels: np.ndarray) -> None:
        """Fit the forest of ``task`` on the given features and labels.

        Raises:
            ValueError: If ``task`` is unknown.
        """
        self._check_task(task)
        self.model[task].fit(input_features, labels)

    def predict(self, task: str, features: np.ndarray) -> np.ndarray:
        """
        Predicts the probability of the positive class (label 1) of a given
        Tox21 target for a given np.array of molecule features.

        Args:
            task: Name of the target; must be in ``self.tasks``.
            features: 2-D array (one row per molecule) or a single 1-D
                feature vector, which is treated as one molecule.

        Returns:
            1-D array with one probability per molecule.

        Raises:
            ValueError: If ``task`` is unknown.
        """
        self._check_task(task)
        estimator = self.model[task]
        probabilities = estimator.predict_proba(np.atleast_2d(features))
        # predict_proba columns follow ``classes_``; look the positive class up
        # instead of hard-coding column 1, which does not exist when the
        # forest saw only a single class during training.
        positive_columns = np.flatnonzero(estimator.classes_ == 1)
        if positive_columns.size == 0:
            return np.zeros(probabilities.shape[0])
        return probabilities[:, positive_columns[0]]

predict.py ADDED Viewed

	@@ -0,0 +1,42 @@

+"""
+This file includes a predict function for Tox21.
+As an input it takes a list of SMILES and it outputs a nested dictionary with
+SMILES and target names as keys.
+"""
+# ---------------------------------------------------------------------------------------
+# Dependencies
+from typing import List
+from collections import defaultdict
+from data import preprocess_molecules
+from model import Tox21RFClassifier
+# ---------------------------------------------------------------------------------------
def predict(smiles_list: List[str]) -> dict:
    """
    Applies the classifier to a list of SMILES strings.

    Args:
        smiles_list: SMILES strings to score.

    Returns:
        Nested dictionary ``{smiles: {target: score}}`` with one float score
        per target. Molecules that were removed during preprocessing get a
        score of 0.0 for every target.
    """
    # preprocessing pipeline
    features, removed_idxs = preprocess_molecules(
        smiles_list,
        load_ecdf_path="assets/ecdfs.pkl",
        load_scaler_path="assets/scaler.pkl",
    )

    # setup model
    model = Tox21RFClassifier(seed=42)
    model.load_model("assets/model/")

    # ``features`` only has rows for molecules that survived preprocessing,
    # so map each surviving input position to its feature row.
    removed = set(removed_idxs)
    kept_idxs = [i for i in range(len(smiles_list)) if i not in removed]
    row_of = {idx: row for row, idx in enumerate(kept_idxs)}

    # score all kept molecules at once per target (skipped if none survived)
    task_scores = (
        {target: model.predict(target, features) for target in model.tasks}
        if kept_idxs
        else {}
    )

    # make predictions
    predictions = defaultdict(dict)
    for i, smiles in enumerate(smiles_list):
        row = row_of.get(i)
        for target in model.tasks:
            predictions[smiles][target] = (
                0.0 if row is None else float(task_scores[target][row])
            )
    return predictions

requirements.txt CHANGED Viewed

@@ -1,2 +1,9 @@
 fastapi
 uvicorn[standard]

 fastapi
 uvicorn[standard]
+statsmodels
+rdkit
+numpy
+scikit-learn
+joblib
+tabulate
+datasets

train.py ADDED Viewed

	@@ -0,0 +1,79 @@

+"""
+Script for fitting and saving any preprocessing assets, as well as the fitted RandomForest model
+"""
+import numpy as np
+from tabulate import tabulate
+from datasets import load_dataset
+from sklearn.metrics import roc_auc_score
+from data import preprocess_molecules
+from model import Tox21RFClassifier
+from utils import HF_TOKEN
def get_sample_mask(removed_idxs: list[int], labels: np.ndarray):
    """Build the boolean masks that align features and labels for one task.

    Args:
        removed_idxs: Positions (into ``labels``) of molecules that have no
            feature row.
        labels: Per-molecule labels of one task; NaN marks a missing label.

    Returns:
        ``(feature_mask, label_mask)``: ``feature_mask`` has one entry per
        non-removed molecule and selects those with a label; ``label_mask``
        has one entry per label and selects labels that are present and
        belong to a non-removed molecule.
    """
    has_label = np.logical_not(np.isnan(labels))
    is_kept = np.ones(np.shape(labels), dtype=bool)
    is_kept[removed_idxs] = False
    return has_label[is_kept], has_label & is_kept
def main():
    """Fit the preprocessing assets and the per-task forests, save them, and
    report the validation ROC-AUC for every task.

    Writes ``assets/ecdfs.pkl``, ``assets/scaler.pkl`` and the model files
    under ``assets/model/``.
    """
    import os  # local import: only needed to create the output folder

    # save preprocessing scaler and ecdf distributions
    assets_folder = "assets"
    save_folder = "assets/model/"
    # The pickles are written during preprocessing, so the folder must exist.
    os.makedirs(assets_folder, exist_ok=True)

    ds = load_dataset("tschouis/tox21", token=HF_TOKEN)

    print("Preprocess train molecules")
    train_smiles = list(ds["train"]["smiles"])
    train_features, train_removed_idxs = preprocess_molecules(
        train_smiles,
        save_ecdf_path="assets/ecdfs.pkl",
        save_scaler_path="assets/scaler.pkl",
    )

    print("Preprocess validation molecules")
    val_smiles = list(ds["validation"]["smiles"])
    val_features, val_removed_idxs = preprocess_molecules(
        val_smiles,
        load_ecdf_path="assets/ecdfs.pkl",
        load_scaler_path="assets/scaler.pkl",
    )

    # Convert each split once instead of once per task.
    train_frame = ds["train"].to_pandas()
    val_frame = ds["validation"].to_pandas()

    model = Tox21RFClassifier(seed=42)

    print("Start training.")
    for task in model.tasks:
        task_labels = train_frame[task].to_numpy()
        feature_mask, label_mask = get_sample_mask(train_removed_idxs, task_labels)
        print(f"Fit task {task} using {int(label_mask.sum())} samples")
        model.fit(
            task, train_features[feature_mask], task_labels[label_mask].astype(int)
        )

    # Actually persist the fitted forests; the original only printed this
    # message while the save call was commented out.
    print(f"Save model under {save_folder}")
    model.save_model(save_folder)

    print("Evaluate model")
    results = {}
    for task in model.tasks:
        task_labels = val_frame[task].to_numpy()
        feature_mask, label_mask = get_sample_mask(val_removed_idxs, task_labels)
        pred = model.predict(task, val_features[feature_mask])
        results[task] = [
            roc_auc_score(y_true=task_labels[label_mask].astype(int), y_score=pred)
        ]

    print("Results:")
    print(tabulate(results, headers="keys"))


if __name__ == "__main__":
    main()

utils.py ADDED Viewed

	@@ -0,0 +1,441 @@

+## These MolStandardizer classes are due to Paolo Tosco
+## It was taken from the FS-Mol github
+## (https://github.com/microsoft/FS-Mol/blob/main/fs_mol/preprocessing/utils/
+##  standardizer.py)
+## They ensure that a sequence of standardization operations are applied
+## https://gist.github.com/ptosco/7e6b9ab9cc3e44ba0919060beaed198e
+import os
+import pickle
+from rdkit import Chem
+from rdkit.Chem.MolStandardize import rdMolStandardize
# Hugging Face access token taken from the environment (None when unset).
HF_TOKEN = os.getenv("HF_TOKEN")

# Names of the Tox21 targets, in the order used throughout the project.
TASKS = [
    "NR-AR", "NR-AR-LBD", "NR-AhR", "NR-Aromatase",
    "NR-ER", "NR-ER-LBD", "NR-PPAR-gamma",
    "SR-ARE", "SR-ATAD5", "SR-HSE", "SR-MMP", "SR-p53",
]
# Positions of the 200 descriptors that are kept: 0-16 and 25-207
# (positions 17-24 are left out).
USED_200_DESCR = [*range(0, 17), *range(25, 208)]
class Standardizer:
    """
    Simple wrapper class around rdkit Standardizer.

    The rdMolStandardize helper objects are created lazily on first access
    through the properties below and then reused.
    """

    DEFAULT_CANON_TAUT = False
    DEFAULT_METAL_DISCONNECT = False
    MAX_TAUTOMERS = 100
    MAX_TRANSFORMS = 100
    MAX_RESTARTS = 200
    PREFER_ORGANIC = True

    def __init__(
        self,
        metal_disconnect=None,
        canon_taut=None,
    ):
        """
        Constructor.
        All parameters are optional.
        :param metal_disconnect:    if True, metallorganic complexes are
                                    disconnected
        :param canon_taut:          if True, molecules are converted to their
                                    canonical tautomer
        """
        super().__init__()
        if metal_disconnect is None:
            metal_disconnect = self.DEFAULT_METAL_DISCONNECT
        if canon_taut is None:
            canon_taut = self.DEFAULT_CANON_TAUT
        self._canon_taut = canon_taut
        self._metal_disconnect = metal_disconnect
        # Lazily created helpers, see the corresponding properties.
        self._taut_enumerator = None
        self._uncharger = None
        self._lfrag_chooser = None
        self._metal_disconnector = None
        self._normalizer = None
        self._reionizer = None
        self._params = None

    @property
    def params(self):
        """Return the MolStandardize CleanupParameters."""
        if self._params is None:
            self._params = rdMolStandardize.CleanupParameters()
            self._params.maxTautomers = self.MAX_TAUTOMERS
            self._params.maxTransforms = self.MAX_TRANSFORMS
            self._params.maxRestarts = self.MAX_RESTARTS
            self._params.preferOrganic = self.PREFER_ORGANIC
            self._params.tautomerRemoveSp3Stereo = False
        return self._params

    @property
    def canon_taut(self):
        """Return whether tautomer canonicalization will be done."""
        return self._canon_taut

    @property
    def metal_disconnect(self):
        """Return whether metallorganic complexes will be disconnected."""
        return self._metal_disconnect

    @property
    def taut_enumerator(self):
        """Return the TautomerEnumerator object."""
        if self._taut_enumerator is None:
            self._taut_enumerator = rdMolStandardize.TautomerEnumerator(self.params)
        return self._taut_enumerator

    @property
    def uncharger(self):
        """Return the Uncharger object."""
        if self._uncharger is None:
            self._uncharger = rdMolStandardize.Uncharger()
        return self._uncharger

    @property
    def lfrag_chooser(self):
        """Return the LargestFragmentChooser object."""
        if self._lfrag_chooser is None:
            self._lfrag_chooser = rdMolStandardize.LargestFragmentChooser(
                self.params.preferOrganic
            )
        return self._lfrag_chooser

    @property
    def metal_disconnector(self):
        """Return the MetalDisconnector object."""
        if self._metal_disconnector is None:
            self._metal_disconnector = rdMolStandardize.MetalDisconnector()
        return self._metal_disconnector

    @property
    def normalizer(self):
        """Return the Normalizer object."""
        if self._normalizer is None:
            self._normalizer = rdMolStandardize.Normalizer(
                self.params.normalizationsFile, self.params.maxRestarts
            )
        return self._normalizer

    @property
    def reionizer(self):
        """Return the Reionizer object."""
        if self._reionizer is None:
            self._reionizer = rdMolStandardize.Reionizer(self.params.acidbaseFile)
        return self._reionizer

    def charge_parent(self, mol_in):
        """Sequentially apply a series of MolStandardize operations:
        * MetalDisconnector (only if metal disconnection is enabled)
        * Normalizer
        * Reionizer
        * LargestFragmentChooser
        * Uncharger
        The net result is that a desalted, normalized, neutral
        molecule with implicit Hs is returned.
        """
        params = Chem.RemoveHsParameters()
        params.removeAndTrackIsotopes = True
        mol_in = Chem.RemoveHs(mol_in, params, sanitize=False)
        if self._metal_disconnect:
            mol_in = self.metal_disconnector.Disconnect(mol_in)
        normalized = self.normalizer.normalize(mol_in)
        Chem.SanitizeMol(normalized)
        normalized = self.reionizer.reionize(normalized)
        Chem.AssignStereochemistry(normalized)
        normalized = self.lfrag_chooser.choose(normalized)
        normalized = self.uncharger.uncharge(normalized)
        # need this to reassess aromaticity on things like
        # cyclopentadienyl, tropylium, azolium, etc.
        Chem.SanitizeMol(normalized)
        return Chem.RemoveHs(Chem.AddHs(normalized))

    def standardize_mol(self, mol_in):
        """
        Standardize a single molecule.
        :param mol_in:  a Chem.Mol
        :return:        * (standardized Chem.Mol, n_taut) tuple
                          if success. n_taut will be negative if
                          tautomer enumeration was aborted due
                          to reaching a limit
                        * (None, error_msg) if failure
        This calls self.charge_parent() and, if self._canon_taut
        is True, runs tautomer canonicalization.
        Failures are reported through the return value; the method
        does not raise for invalid input.
        """
        n_tautomers = 0
        if isinstance(mol_in, Chem.Mol):
            name = None
            try:
                name = mol_in.GetProp("_Name")
            except KeyError:
                pass
            if not name:
                name = "NONAME"
        else:
            # Only Chem.Mol is accepted; SMILES strings are rejected here too.
            error = f"Expected Chem.Mol as input, got {str(type(mol_in))}"
            return None, error
        try:
            mol_out = self.charge_parent(mol_in)
        except Exception as e:
            # Deliberately broad: any failure is reported, not raised.
            error = f"charge_parent FAILED: {str(e).strip()}"
            return None, error
        if self._canon_taut:
            try:
                res = self.taut_enumerator.Enumerate(mol_out, False)
            except TypeError:
                # we are still on the pre-2021 RDKit API
                res = self.taut_enumerator.Enumerate(mol_out)
            except Exception as e:
                # something else went wrong
                error = f"canon_taut FAILED: {str(e).strip()}"
                return None, error
            n_tautomers = len(res)
            if hasattr(res, "status"):
                completed = (
                    res.status == rdMolStandardize.TautomerEnumeratorStatus.Completed
                )
            else:
                # we are still on the pre-2021 RDKit API
                completed = len(res) < 1000
            if not completed:
                n_tautomers = -n_tautomers
            try:
                mol_out = self.taut_enumerator.PickCanonical(res)
            except AttributeError:
                # we are still on the pre-2021 RDKit API
                # Select by score only: comparing (score, mol) tuples would
                # fall back to comparing Mol objects on tied scores.
                mol_out = max(res, key=self.taut_enumerator.ScoreTautomer)
            except Exception as e:
                # something else went wrong
                error = f"canon_taut FAILED: {str(e).strip()}"
                return None, error
        mol_out.SetProp("_Name", name)
        return mol_out, n_tautomers
def load_pickle(path: str):
    """Return the object unpickled from the file at ``path``.

    NOTE: unpickling can execute arbitrary code; only use on trusted files.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)
def write_pickle(path: str, obj: object):
    """Pickle ``obj`` to the file at ``path``.

    Missing parent directories are created first, so writing into a folder
    that does not exist yet works. An existing file is overwritten.
    """
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "wb") as file:
        pickle.dump(obj, file)