Spaces:

ml-jku
/

tox21_rf_classifier

Sleeping

App Files Files Community

antoniaebner committed on Oct 7

Commit

593848b

1 Parent(s): 6eb59be

update pipeline

Browse files

Files changed (5) hide show

predict.py +25 -17
src/data.py +74 -170
src/model.py +17 -7
src/preprocess.py +405 -0
src/utils.py +2 -0

predict.py CHANGED Viewed

@@ -8,13 +8,14 @@ SMILES and target names as keys.
 # Dependencies
 from collections import defaultdict
-from src.data import preprocess_molecules
-from src.model import Tox21RFClassifier
 # ---------------------------------------------------------------------------------------
-def predict(smiles_list: list[str]) -> dict:
     """Applies the classifier to a list of SMILES strings. Returns prediction=0.0 for
     any molecule that could not be cleaned.
@@ -26,29 +27,36 @@ def predict(smiles_list: list[str]) -> dict:
     """
     print(f"Received {len(smiles_list)} SMILES strings")
     # preprocessing pipeline
-    features, removed_idxs = preprocess_molecules(
         smiles_list,
-        load_ecdf_path="assets/ecdfs.pkl",
-        load_scaler_path="assets/scaler.pkl",
     )
-    print(f"{len(removed_idxs)} molecules removed during cleaning")
     # setup model
     model = Tox21RFClassifier(seed=42)
-    model.load_model("assets/rf_alltasks.joblib")
     # make predictions
     predictions = defaultdict(dict)
-    # make smiles list with same num_samples as features
-    clean_smiles = [smi for i, smi in enumerate(smiles_list) if i not in removed_idxs]
-    no_pred_smiles = [smi for i, smi in enumerate(smiles_list) if i in removed_idxs]
     for target in model.tasks:
         target_pred = model.predict(target, features)
-        for i, smiles in enumerate(clean_smiles):
-            predictions[smiles][target] = target_pred[i]
-        for smiles in no_pred_smiles:
-            predictions[smiles][target] = 0.0
     return predictions

 # Dependencies
 from collections import defaultdict
+from .data import create_descriptors
+from .utils import load_pickle, KNOWN_DESCR
+from .model import Tox21RFClassifier
 # ---------------------------------------------------------------------------------------
+def predict(smiles_list: list[str]) -> dict[str, dict[str, float]]:
     """Applies the classifier to a list of SMILES strings. Returns prediction=0.0 for
     any molecule that could not be cleaned.
     """
     print(f"Received {len(smiles_list)} SMILES strings")
     # preprocessing pipeline
+    ecdfs_path = "assets/ecdfs.pkl"
+    scaler_path = "assets/scaler.pkl"
+    ecdfs = load_pickle(ecdfs_path)
+    scaler = load_pickle(scaler_path)
+    print(f"Loaded ecdfs from {ecdfs_path}")
+    print(f"Loaded scaler from {scaler_path}")
+    descriptors = KNOWN_DESCR
+    features, mol_mask = create_descriptors(
         smiles_list,
+        ecdfs=ecdfs,
+        scaler=scaler,
+        descriptors=descriptors,
     )
+    print(f"Created descriptors {descriptors} for molecules.")
+    print(f"{len(mol_mask) - sum(mol_mask)} molecules removed during cleaning")
     # setup model
     model = Tox21RFClassifier(seed=42)
+    model_path = "assets/rf_alltasks.joblib"
+    model.load_model(model_path)
+    print(f"Loaded model from {model_path}")
     # make predictions
     predictions = defaultdict(dict)
+    # create a list with same length as smiles_list to obtain indices for respective features
+    feat_indices = np.cumsum(mol_mask) - 1
     for target in model.tasks:
         target_pred = model.predict(target, features)
+        for smiles, is_clean, i in zip(smiles_list, mol_mask, feat_indices):
+            predictions[smiles][target] = float(target_pred[i]) if is_clean else 0.0
     return predictions

src/data.py CHANGED Viewed

@@ -7,8 +7,10 @@ SMILES and target names as keys.
 """
 import os
 import numpy as np
 from sklearn.preprocessing import StandardScaler
 from statsmodels.distributions.empirical_distribution import ECDF
@@ -17,177 +19,79 @@ from rdkit import Chem, DataStructs
 from rdkit.Chem import Descriptors, rdFingerprintGenerator
 from rdkit.Chem.rdchem import Mol
-from .utils import USED_200_DESCR, Standardizer, load_pickle, write_pickle
-def preprocess_molecules(
-    smiles_list: list[str],
-    load_ecdf_path: str = "",
-    load_scaler_path: str = "",
-    save_ecdf_path: str = "",
-    save_scaler_path: str = "",
-) -> tuple[np.ndarray, list[int]]:
-    """Preprocessing pipeline for a list of molecules.
-    Args:
-        smiles_list (list[str]): list of SMILES
-        load_ecdf_path (str, optional): Path to load ECDFs from. Defaults to "".
-        load_scaler_path (str, optional): Path to load fitted StandardScaler from. Defaults to "".
-        save_ecdf_path (str, optional): Path to save calculated ECDFs. Defaults to "".
-        save_scaler_path (str, optional): Path to save fitted StandardScaler. Defaults to "".
-    Returns:
-        np.ndarray: normalized ECFPs fingerprints and RDKit descriptor quantiles
-        list[int]: list of removed indices of molecules that could not be cleaned
-    """
-    assert not (
-        load_ecdf_path and save_ecdf_path
-    ), "Cannot pass 'load_ecdf_path' and 'save_ecdf_path' simultaneously"
-    assert not (
-        load_scaler_path and save_scaler_path
-    ), "Cannot pass 'load_scaler_path' and 'save_scaler_path' simultaneously"
-    ecdfs = (
-        load_pickle(load_ecdf_path)
-        if load_ecdf_path and os.path.exists(load_ecdf_path)
-        else None
     )
-    scaler = (
-        load_pickle(load_scaler_path)
-        if load_scaler_path and os.path.exists(load_scaler_path)
-        else None
-    )
-    # Create cleaned rdkit mol objects
-    mols, removed_idxs = create_cleaned_mol_objects(smiles_list)
-    print("Cleaned molecules")
-    # Create fingerprints and descriptors
-    ecfps = create_ecfp_fps(mols)
-    print("Created ECFP fingerprints")
-    rdkit_descrs = create_rdkit_descriptors(mols)
-    print("Created RDKit descriptors")
-    # Create and save ecdfs
-    if ecdfs is None:
-        print("Create ECDFs")
-        ecdfs = []
-        for column in range(rdkit_descrs.shape[1]):
-            raw_values = rdkit_descrs[:, column].reshape(-1)
-            ecdfs.append(ECDF(raw_values))
-        if save_ecdf_path:
-            write_pickle(save_ecdf_path, ecdfs)
-            print(f"Saved ECDFs under {save_ecdf_path}")
-    # Create quantiles
-    rdkit_descr_quantiles = create_quantiles(rdkit_descrs, ecdfs)
-    print("Created quantiles of RDKit descriptors")
-    # Concatenate features
-    raw_features = np.concatenate((ecfps, rdkit_descr_quantiles), axis=1)
-    if scaler is None:
-        scaler = StandardScaler()
-        scaler.fit(raw_features)
-        print("Fitted the StandardScaler")
-        if save_scaler_path:
-            write_pickle(save_scaler_path, scaler)
-            print(f"Saved the StandardScaler under {save_scaler_path}")
-    # Normalize feature vectors
-    normalized_features = scaler.transform(raw_features)
-    print("Normalized the molecule features")
-    return normalized_features, removed_idxs
-def create_cleaned_mol_objects(smiles: list[str]) -> list[Mol]:
-    """This function creates cleaned RDKit mol objects from a list of SMILES.
-    Args:
-        smiles (list[str]): list of SMILES
-    Returns:
-        list[Mol]: list of cleaned molecules
-    """
-    sm = Standardizer(canon_taut=True)
-    removed_idxs = list()
-    mols = list()
-    for i, smile in enumerate(smiles):
-        mol = Chem.MolFromSmiles(smile)
-        standardized_mol, _ = sm.standardize_mol(mol)
-        if standardized_mol is None:
-            removed_idxs.append(i)
-            continue
-        can_mol = Chem.MolFromSmiles(Chem.MolToSmiles(standardized_mol))
-        mols.append(can_mol)
-    return mols, removed_idxs
-def create_ecfp_fps(mols: list[Mol]) -> np.ndarray:
-    """This function creates ECFP fingerprints for a list of molecules.
-    Args:
-        mols (list[Mol]): list of molecules
-    Returns:
-        np.ndarray: ECFP fingerprints of molecules
-    """
-    ecfps = list()
-    for mol in mols:
-        fp_sparse_vec = rdFingerprintGenerator.GetCountFPs(
-            [mol], fpType=rdFingerprintGenerator.MorganFP
-        )[0]
-        fp = np.zeros((0,), np.int8)
-        DataStructs.ConvertToNumpyArray(fp_sparse_vec, fp)
-        ecfps.append(fp)
-    return np.array(ecfps)
-def create_rdkit_descriptors(mols: list[Mol]) -> np.ndarray:
-    """This function creates RDKit descriptors for a list of molecules.
-    Args:
-        mols (list[Mol]): list of molecules
-    Returns:
-        np.ndarray: RDKit descriptors of molecules
-    """
-    rdkit_descriptors = list()
-    for mol in mols:
-        descrs = []
-        for _, descr_calc_fn in Descriptors._descList:
-            descrs.append(descr_calc_fn(mol))
-        descrs = np.array(descrs)
-        descrs = descrs[USED_200_DESCR]
-        rdkit_descriptors.append(descrs)
-    return np.array(rdkit_descriptors)
-def create_quantiles(raw_features: np.ndarray, ecdfs: list) -> np.ndarray:
-    """Create quantile values for the given features using one ECDF per column
-    Args:
-        raw_features (np.ndarray): values to put into quantiles
-        ecdfs (list): ECDFs to use
-    Returns:
-        np.ndarray: computed quantiles
-    """
-    quantiles = np.zeros_like(raw_features)
-    for column in range(raw_features.shape[1]):
-        raw_values = raw_features[:, column].reshape(-1)
-        ecdf = ecdfs[column]
-        q = ecdf(raw_values)
-        quantiles[:, column] = q
-    return quantiles

 """
 import os
+from typing import Iterable, Literal
 import numpy as np
+import torch
 from sklearn.preprocessing import StandardScaler
 from statsmodels.distributions.empirical_distribution import ECDF
 from rdkit.Chem import Descriptors, rdFingerprintGenerator
 from rdkit.Chem.rdchem import Mol
+from .utils import USED_200_DESCR, Standardizer, load_pickle, write_pickle, KNOWN_DESCR
+from .preprocess import normalize_features
+def get_descriptor_dataset(
+    data_path: str,
+    descriptors: Iterable[str] | Literal["all"],
+    scaler=None,
+    save_scaler_path: str = "data/scaler.pkl",
+    verbose=True,
+    normalize=True,
+):
+    if descriptors == "all":
+        descriptors = KNOWN_DESCR
+    assert isinstance(descriptors, Iterable), "Passed descriptors are not iterable!"
+    assert all(
+        [descr in KNOWN_DESCR for descr in descriptors]
+    ), f"Passed descriptors contains unknown descriptor types. Allowed descriptors: {KNOWN_DESCR}"
+    datafile = np.load(data_path)
+    if not isinstance(datafile, np.ndarray):
+        # concatenate all descriptors and normalize
+        data = np.concatenate([datafile[descr] for descr in descriptors], axis=1)
+        labels = datafile["labels"]
+    else:
+        print("NPY file passed, cannot select specific descriptors")
+        data, labels = datafile[:, :-12], datafile[:, -12:]
+    if normalize:
+        data, scaler = normalize_features(
+            data,
+            scaler=scaler,
+            save_scaler_path=save_scaler_path,
+            verbose=verbose,
+        )
+    # filter out unsanitized molecules
+    mask = ~np.isnan(data).any(axis=1)
+    data = data[mask]
+    labels = labels[mask]
+    assert data.shape[0] == labels.shape[0], (
+        f"Mismatch between data and labels: "
+        f"data has {data.shape[0]} samples, but labels has {labels.shape[0]} samples."
     )
+    return (data, labels, scaler)
+def get_torch_descriptor_dataset(
+    data_path: str,
+    descriptors: list[str],
+    scaler=None,
+    save_scaler_path: str = "data/scaler.pkl",
+    nan_to_num: int = -100,
+    verbose=True,
+    normalize=True,
+) -> torch.utils.data.TensorDataset:
+    data, labels, scaler = get_descriptor_dataset(
+        data_path,
+        descriptors,
+        scaler,
+        save_scaler_path,
+        verbose=verbose,
+        normalize=normalize,
+    )
+    labels = np.nan_to_num(labels, nan=nan_to_num)
+    dataset = torch.utils.data.TensorDataset(
+        torch.FloatTensor(data), torch.LongTensor(labels)
+    )
+    return dataset, scaler

src/model.py CHANGED Viewed

@@ -19,17 +19,27 @@ from .utils import TASKS
 class Tox21RFClassifier:
     """A random forest classifier that assigns a toxicity score to a given SMILES string."""
-    def __init__(self, seed: int = 42):
         """Initialize a random forest classifier for each of the 12 Tox21 tasks.
         Args:
             seed (int, optional): seed for RF to ensure reproducibility. Defaults to 42.
         """
         self.tasks = TASKS
-        self.model = {
-            task: RandomForestClassifier(n_estimators=1001, random_state=seed)
-            for task in self.tasks
-        }
     def load_model(self, path: str) -> None:
         """Loads the model from a given path
@@ -45,8 +55,8 @@ class Tox21RFClassifier:
         Args:
             path (str): path to save model to
         """
-        if not os.path.exists(os.path.pardir(path)):
-            os.makedirs(os.path.pardir(path))
         joblib.dump(self.model, path)

 class Tox21RFClassifier:
     """A random forest classifier that assigns a toxicity score to a given SMILES string."""
+    def __init__(self, seed: int = 42, task_config: dict = None):
         """Initialize a random forest classifier for each of the 12 Tox21 tasks.
         Args:
             seed (int, optional): seed for RF to ensure reproducibility. Defaults to 42.
         """
         self.tasks = TASKS
+        if task_config is None:
+            self.model = {
+                task: RandomForestClassifier(
+                    n_estimators=1000, random_state=seed, n_jobs=8
+                )
+                for task in self.tasks
+            }
+        else:
+            self.model = {
+                task: RandomForestClassifier(
+                    **task_config[task], random_state=seed, n_jobs=8
+                )
+                for task in self.tasks
+            }
     def load_model(self, path: str) -> None:
         """Loads the model from a given path
         Args:
             path (str): path to save model to
         """
+        if not os.path.exists(os.path.dirname(path)):
+            os.makedirs(os.path.dirname(path))
         joblib.dump(self.model, path)

src/preprocess.py ADDED Viewed

	@@ -0,0 +1,405 @@

+# pipeline taken from https://huggingface.co/spaces/ml-jku/mhnfs/blob/main/src/data_preprocessing/create_descriptors.py
+"""
+This file includes the data processing for Tox21.
+As an input it takes a list of SMILES and it outputs a nested dictionary with
+SMILES and target names as keys.
+"""
+import os
+import argparse
+import json
+from typing import Iterable
+import numpy as np
+import pandas as pd
+from sklearn.preprocessing import StandardScaler
+from statsmodels.distributions.empirical_distribution import ECDF
+from datasets import load_dataset
+from rdkit import Chem, DataStructs
+from rdkit.Chem import Descriptors, rdFingerprintGenerator, MACCSkeys
+from rdkit.Chem.rdchem import Mol
+from .utils import (
+    TASKS,
+    KNOWN_DESCR,
+    HF_TOKEN,
+    USED_200_DESCR,
+    Standardizer,
+    load_pickle,
+    write_pickle,
+)
+parser = argparse.ArgumentParser(
+    description="Data preprocessing script for the Tox21 dataset"
+)
+parser.add_argument(
+    "--save_folder",
+    type=str,
+    default="data/",
+)
+parser.add_argument(
+    "--use_hf",
+    type=int,
+    default=0,
+)
+parser.add_argument(
+    "--path_ecdfs",
+    type=str,
+    default="data/ecdfs.pkl",
+)
+parser.add_argument(
+    "--tox_smarts_filepath",
+    type=str,
+    default="data/tox_smarts.json",
+)
+def create_cleaned_mol_objects(smiles: list[str]) -> tuple[list[Mol], np.ndarray]:
+    """This function creates cleaned RDKit mol objects from a list of SMILES.
+    Args:
+        smiles (list[str]): list of SMILES
+    Returns:
+        list[Mol]: list of cleaned molecules
+        np.ndarray[bool]: mask that contains False at index `i`, if molecule in `smiles` at
+            index `i` could not be cleaned and was removed.
+    """
+    sm = Standardizer(canon_taut=True)
+    clean_mol_mask = list()
+    mols = list()
+    for i, smile in enumerate(smiles):
+        mol = Chem.MolFromSmiles(smile)
+        standardized_mol, _ = sm.standardize_mol(mol)
+        is_cleaned = standardized_mol is not None
+        clean_mol_mask.append(is_cleaned)
+        if not is_cleaned:
+            continue
+        can_mol = Chem.MolFromSmiles(Chem.MolToSmiles(standardized_mol))
+        mols.append(can_mol)
+    return mols, np.array(clean_mol_mask)
+def create_ecfp_fps(mols: list[Mol]) -> np.ndarray:
+    """This function creates ECFP fingerprints for a list of molecules.
+    Args:
+        mols (list[Mol]): list of molecules
+    Returns:
+        np.ndarray: ECFP fingerprints of molecules
+    """
+    ecfps = list()
+    for mol in mols:
+        fp_sparse_vec = rdFingerprintGenerator.GetCountFPs(
+            [mol], fpType=rdFingerprintGenerator.MorganFP
+        )[0]
+        fp = np.zeros((0,), np.int8)
+        DataStructs.ConvertToNumpyArray(fp_sparse_vec, fp)
+        ecfps.append(fp)
+    return np.array(ecfps)
+def create_maccs_keys(mols: list[Mol]) -> np.ndarray:
+    maccs = [MACCSkeys.GenMACCSKeys(x) for x in mols]
+    return np.array(maccs)
+def get_tox_patterns(filepath: str):
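+    """Loads and parses the tox SMARTS patterns defined in a JSON file (e.g. tox_smarts.json).
+    Args:
+        filepath (str): path to the JSON file; the second element of each entry is used
+            as the SMARTS expression
+    Returns:
+        list: one (patterns, negations, merge_any) tuple per SMARTS expression
+    """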
+    # load patterns
+    with open(filepath) as f:
+        smarts_list = [s[1] for s in json.load(f)]
+    # Code does not work for this case
+    assert len([s for s in smarts_list if ("AND" in s) and ("OR" in s)]) == 0
+    # Chem.MolFromSmarts takes a long time so it pays off to parse all the smarts first
+    # and then use them for all molecules. This gives a huge speedup over existing code.
+    # a list of patterns, whether to negate the match result and how to join them to obtain one boolean value
+    all_patterns = []
+    for smarts in smarts_list:
+        patterns = []  # list of smarts-patterns
+        # value for each of the patterns above. Negates the values of the above later.
+        negations = []
+        if " AND " in smarts:
+            smarts = smarts.split(" AND ")
+            merge_any = False  # If an ' AND ' is found all 'subsmarts' have to match
+        else:
+            # If there is an ' OR ' present, it's enough if any of the 'subsmarts' match.
+            # This branch also handles smarts where neither ' OR ' nor ' AND ' occurs
+            smarts = smarts.split(" OR ")
+            merge_any = True
+        # for all subsmarts check if they are preceded by 'NOT '
+        for s in smarts:
+            neg = s.startswith("NOT ")
+            if neg:
+                s = s[4:]
+            patterns.append(Chem.MolFromSmarts(s))
+            negations.append(neg)
+        all_patterns.append((patterns, negations, merge_any))
+    return all_patterns
+def create_tox_features(mols: list[Mol], patterns: list) -> np.ndarray:
+    """Matches the tox patterns against each molecule. Returns a boolean array with one row per molecule"""
+    tox_data = []
+    for mol in mols:
+        mol_features = []
+        for patts, negations, merge_any in patterns:
+            matches = [mol.HasSubstructMatch(p) for p in patts]
+            matches = [m != n for m, n in zip(matches, negations)]
+            if merge_any:
+                pres = any(matches)
+            else:
+                pres = all(matches)
+            mol_features.append(pres)
+        tox_data.append(np.array(mol_features))
+    return np.array(tox_data)
+def create_rdkit_descriptors(mols: list[Mol]) -> np.ndarray:
+    """This function creates RDKit descriptors for a list of molecules.
+    Args:
+        mols (list[Mol]): list of molecules
+    Returns:
+        np.ndarray: RDKit descriptors of molecules
+    """
+    rdkit_descriptors = list()
+    for mol in mols:
+        descrs = []
+        for _, descr_calc_fn in Descriptors._descList:
+            descrs.append(descr_calc_fn(mol))
+        descrs = np.array(descrs)
+        descrs = descrs[USED_200_DESCR]
+        rdkit_descriptors.append(descrs)
+    return np.array(rdkit_descriptors)
+def create_quantiles(raw_features: np.ndarray, ecdfs: list) -> np.ndarray:
+    """Create quantile values for the given features using one ECDF per column
+    Args:
+        raw_features (np.ndarray): values to put into quantiles
+        ecdfs (list): ECDFs to use
+    Returns:
+        np.ndarray: computed quantiles
+    """
+    quantiles = np.zeros_like(raw_features)
+    for column in range(raw_features.shape[1]):
+        raw_values = raw_features[:, column].reshape(-1)
+        ecdf = ecdfs[column]
+        q = ecdf(raw_values)
+        quantiles[:, column] = q
+    return quantiles
+def fill(features, mask, value=np.nan):
+    n_mols = len(mask)
+    n_features = features.shape[1]
+    data = np.zeros(shape=(n_mols, n_features))
+    data.fill(value)
+    data[~mask] = features
+    return data
+def normalize_features(
+    raw_features,
+    scaler=None,
+    save_scaler_path: str = "",
+    verbose=True,
+):
+    if scaler is None:
+        scaler = StandardScaler()
+        scaler.fit(raw_features)
+        if verbose:
+            print("Fitted the StandardScaler")
+        if save_scaler_path:
+            write_pickle(save_scaler_path, scaler)
+            if verbose:
+                print(f"Saved the StandardScaler under {save_scaler_path}")
+    # Normalize feature vectors
+    normalized_features = scaler.transform(raw_features)
+    if verbose:
+        print("Normalized molecule features")
+    return normalized_features, scaler
+def create_descriptors(
+    smiles,
+    ecdfs=None,
+    scaler=None,
+    descriptors: Iterable = KNOWN_DESCR,
+):
+    # Create cleaned rdkit mol objects
+    mols, clean_mol_mask = create_cleaned_mol_objects(smiles)
+    print("Cleaned molecules")
+    features = []
+    if "ecfps" in descriptors:
+        # Create fingerprints and descriptors
+        ecfps = create_ecfp_fps(mols)
+        # expand using mol_mask
+        ecfps = fill(ecfps, ~clean_mol_mask)
+        features.append(ecfps)
+        print("Created ECFP fingerprints")
+    if "rdkit_descr_quantiles" in descriptors:
+        rdkit_descrs = create_rdkit_descriptors(mols)
+        print("Created RDKit descriptors")
+        # Create and save ecdfs
+        if ecdfs is None:
+            print("Create ECDFs")
+            ecdfs = []
+            for column in range(rdkit_descrs.shape[1]):
+                raw_values = rdkit_descrs[:, column].reshape(-1)
+                ecdfs.append(ECDF(raw_values))
+        # Create quantiles
+        rdkit_descr_quantiles = create_quantiles(rdkit_descrs, ecdfs)
+        # expand using mol_mask
+        rdkit_descr_quantiles = fill(rdkit_descr_quantiles, ~clean_mol_mask)
+        features.append(rdkit_descr_quantiles)
+        print("Created quantiles of RDKit descriptors")
+    if "maccs" in descriptors:
+        maccs = create_maccs_keys(mols)
+        maccs = fill(maccs, ~clean_mol_mask)
+        features.append(maccs)
+        print("Created MACCS keys")
+    if "tox" in descriptors:
+        tox_patterns = get_tox_patterns("assets/tox_smarts.json")
+        tox = create_tox_features(mols, tox_patterns)
+        tox = fill(tox, ~clean_mol_mask)
+        features.append(tox)
+        print("Created Tox features")
+    # concatenate features
+    raw_features = np.concatenate(features, axis=1)
+    # normalize with scaler if scaler is passed, else create scaler
+    features, _ = normalize_features(
+        raw_features,
+        scaler=scaler,
+        verbose=True,
+    )
+    return features, clean_mol_mask
+def main(args):
+    splits = ["train", "validation"]
+    ds = load_dataset("tschouis/tox21", token=HF_TOKEN)
+    for split in splits:
+        print(f"Preprocess {split} molecules")
+        smiles = list(ds[split]["smiles"])
+        # Create cleaned rdkit mol objects
+        mols, clean_mol_mask = create_cleaned_mol_objects(smiles)
+        print("Cleaned molecules")
+        tox_patterns = get_tox_patterns(args.tox_smarts_filepath)
+        # Create fingerprints and descriptors
+        ecfps = create_ecfp_fps(mols)
+        # expand using mol_mask
+        ecfps = fill(ecfps, ~clean_mol_mask)
+        print("Created ECFP fingerprints")
+        rdkit_descrs = create_rdkit_descriptors(mols)
+        print("Created RDKit descriptors")
+        # Create and save ecdfs
+        if split == "train":
+            print("Create ECDFs")
+            ecdfs = []
+            for column in range(rdkit_descrs.shape[1]):
+                raw_values = rdkit_descrs[:, column].reshape(-1)
+                ecdfs.append(ECDF(raw_values))
+            write_pickle(args.path_ecdfs, ecdfs)
+            print(f"Saved ECDFs under {args.path_ecdfs}")
+        else:
+            print(f"Load ECDFs from {args.path_ecdfs}")
+            ecdfs = load_pickle(args.path_ecdfs)
+        # Create quantiles
+        rdkit_descr_quantiles = create_quantiles(rdkit_descrs, ecdfs)
+        # expand using mol_mask
+        rdkit_descr_quantiles = fill(rdkit_descr_quantiles, ~clean_mol_mask)
+        print("Created quantiles of RDKit descriptors")
+        maccs = create_maccs_keys(mols)
+        maccs = fill(maccs, ~clean_mol_mask)
+        print("Created MACCS keys")
+        tox = create_tox_features(mols, tox_patterns)
+        tox = fill(tox, ~clean_mol_mask)
+        print("Created Tox features")
+        labels = []
+        for task in TASKS:
+            datasplit = ds[split].to_pandas() if args.use_hf else ds[split]
+            labels.append(datasplit[task].to_numpy())
+        labels = np.stack(labels, axis=1)
+        save_path = os.path.join(args.save_folder, f"tox21_{split}.npz")
+        with open(save_path, "wb") as f:
+            np.savez(
+                f,
+                labels=labels,
+                ecfps=ecfps,
+                rdkit_descr_quantiles=rdkit_descr_quantiles,
+                maccs=maccs,
+                tox=tox,
+            )
+            print(f"Saved preprocessed {split} split under {save_path}")
+    print("Preprocessing finished successfully")
+if __name__ == "__main__":
+    args = parser.parse_args()
+    if not os.path.exists(args.save_folder):
+        os.makedirs(args.save_folder)
+    if not os.path.exists(os.path.dirname(args.path_ecdfs)):
+        os.makedirs(os.path.dirname(args.path_ecdfs))
+    main(args)

src/utils.py CHANGED Viewed

@@ -28,6 +28,8 @@ TASKS = [
     "SR-p53",
 ]
 USED_200_DESCR = [
     0,
     1,

     "SR-p53",
 ]
+KNOWN_DESCR = ["ecfps", "rdkit_descr_quantiles", "maccs", "tox"]
 USED_200_DESCR = [
     0,
     1,