Spaces:

ml-jku
/

tox21_rf_classifier

Sleeping

App Files Files Community

antoniaebner committed on Nov 13

Commit

1994acc

1 Parent(s): 9b322e1

refactoring of feature preprocessing

Browse files

Files changed (7) hide show

config/config.json +27 -8
predict.py +27 -22
preprocess.py +23 -145
src/model.py +13 -87
src/preprocess.py +330 -99
src/utils.py +62 -2
train.py +55 -51

config/config.json CHANGED Viewed

@@ -1,14 +1,33 @@
 {
     "seed": 0,
-    "ecfp_radius": 3,
-    "ecfp_fpsize": 8192,
-    "feature_minvar": 0.01,
-    "feature_maxcorr": 0.95,
-    "model_path": "checkpoints/rf_alltasks.joblib",
-    "data_folder": "data/",
     "log_folder": "logs/",
-    "debug": "false",
-    "task_configs": {
         "NR-AR": {
             "max_depth": "none",
             "max_features": "sqrt",

 {
     "seed": 0,
+    "debug": "false",
+    "device": "cpu",
     "log_folder": "logs/",
+    "data_folder": "data/",
+    "cvfold": 4,
+    "ecfp" : {
+        "radius": 3,
+        "fpsize": 8192
+    },
+    "descriptors": ["ecfps", "tox", "maccs", "rdkit_descrs"],
+    "feature_selection": {
+        "use": "true",
+        "min_var": 0.01,
+        "max_corr": 0.95,
+        "feature_keys": ["ecfps", "tox", "maccs", "rdkit_descrs"],
+        "max_features": -1
+    },
+    "feature_quantilization": {
+        "use": "true",
+        "feature_keys": ["rdkit_descrs"]
+    },
+    "max_samples": -1,
+    "scaler": "standard",
+    "ckpt_path": "checkpoints/rf_alltasks.joblib",
+    "model_configs": {
         "NR-AR": {
             "max_depth": "none",
             "max_features": "sqrt",

predict.py CHANGED Viewed

@@ -6,15 +6,18 @@ SMILES and target names as keys.
 # ---------------------------------------------------------------------------------------
 # Dependencies
 from collections import defaultdict
-import json
 import numpy as np
 from tqdm import tqdm
-from src.preprocess import create_descriptors
-from src.utils import TASKS, normalize_config
 from src.model import Tox21RFClassifier
 # ---------------------------------------------------------------------------------------
 CONFIG_FILE = "./config/config.json"
@@ -35,20 +38,29 @@ def predict(
     print(f"Received {len(smiles_list)} SMILES strings")
     with open(CONFIG_FILE, "r") as f:
-        cfg = json.load(f)
-    cfg = normalize_config(cfg)
     features, is_clean = create_descriptors(
-        smiles_list, radius=cfg["ecfp_radius"], fpsize=cfg["ecfp_fpsize"]
     )
-    n_clean_mols, n_feats = features.shape
-    print(f"Created {n_feats} descriptors for {n_clean_mols} molecules.")
     print(f"{len(is_clean) - sum(is_clean)} molecules removed during cleaning")
     # setup model
     model = Tox21RFClassifier()
-    model.load_model(cfg["model_path"])
-    print(f"Loaded model from {cfg['model_path']}")
     # make predictions
     predictions = defaultdict(dict)
@@ -56,24 +68,17 @@ def predict(
     print(f"Create predictions:")
     preds = []
     for target in tqdm(TASKS):
-        X = features.copy()
-        preds = np.empty_like(is_clean, dtype=np.float64)
         preds[~is_clean] = default_prediction
         preds[is_clean] = model.predict(target, X)
         for smiles, pred in zip(smiles_list, preds):
             predictions[smiles][target] = float(pred)
-        if cfg["debug"]:
             break
     return predictions
-# from hiddens.testing import test_eval
-# with open(CONFIG_FILE, "r") as f:
-#     config = json.load(f)
-# config = normalize_config(config)
-# test_eval(predict, debug=config["debug"], use_only_clean=False, use_only_first=False)

 # ---------------------------------------------------------------------------------------
 # Dependencies
+import json
+import copy
 from collections import defaultdict
+import joblib
 import numpy as np
 from tqdm import tqdm
 from src.model import Tox21RFClassifier
+from src.preprocess import create_descriptors, FeaturePreprocessor
+from src.utils import TASKS, normalize_config
 # ---------------------------------------------------------------------------------------
 CONFIG_FILE = "./config/config.json"
     print(f"Received {len(smiles_list)} SMILES strings")
     with open(CONFIG_FILE, "r") as f:
+        config = json.load(f)
+    config = normalize_config(config)
     features, is_clean = create_descriptors(
+        smiles_list, config["descriptors"], **config["ecfp"]
     )
+    print(f"Created descriptors for {sum(is_clean)} molecules.")
     print(f"{len(is_clean) - sum(is_clean)} molecules removed during cleaning")
     # setup model
     model = Tox21RFClassifier()
+    preprocessor = FeaturePreprocessor(
+        feature_selection_config=config["feature_selection"],
+        feature_quantilization_config=config["feature_quantilization"],
+        descriptors=config["descriptors"],
+        max_samples=config["max_samples"],
+        scaler=config["scaler"],
+    )
+    ckpt = joblib.load(config["ckpt_path"])
+    model.set_state(ckpt["models"])
+    preprocessor.__setstate__(ckpt["preprocessor"])
+    print(f"Loaded model & preprocessor from {config['ckpt_path']}")
     # make predictions
     predictions = defaultdict(dict)
     print(f"Create predictions:")
     preds = []
     for target in tqdm(TASKS):
+        X = copy.deepcopy(features)
+        X = {descr: array[is_clean] for descr, array in X.items()}
+        X = preprocessor.transform(X)
+        preds = np.empty_like(is_clean, dtype=np.float64)
         preds[~is_clean] = default_prediction
         preds[is_clean] = model.predict(target, X)
         for smiles, pred in zip(smiles_list, preds):
             predictions[smiles][target] = float(pred)
+        if config["debug"]:
             break
     return predictions

preprocess.py CHANGED Viewed

@@ -7,186 +7,64 @@ SMILES and target names as keys.
 """
 import os
 import argparse
 import numpy as np
 from src.preprocess import create_descriptors, get_tox21_split
-from src.utils import (
-    TASKS,
-    HF_TOKEN,
-    create_dir,
-)
 parser = argparse.ArgumentParser(
     description="Data preprocessing script for the Tox21 dataset"
 )
 parser.add_argument(
-    "--save_folder",
-    type=str,
-    default="data/",
-    help="Folder to which preprocessed the data CSV and NPZ files should be saved.",
-)
-parser.add_argument(
-    "--cv_fold",
-    type=int,
-    default=4,
-    help="Select fold used as validation set.",
-)
-parser.add_argument(
-    "--feature_selection",
-    type=int,
-    default=1,
-    help="True (=1) to use feature selection.",
-)
-parser.add_argument(
-    "--feature_selection_path",
-    type=str,
-    default="feat_selection.npz",
-    help="Filename for saving feature selections.",
-)
-parser.add_argument(
-    "--min_var",
-    type=float,
-    default=0.01,
-    help="Minimum variance threshold for selecting features.",
-)
-parser.add_argument(
-    "--max_corr",
-    type=float,
-    default=0.95,
-    help="Maximum correlation threshold for selecting features.",
-)
-parser.add_argument(
-    "--ecdfs_path",
     type=str,
-    default="ecdfs.pkl",
-    help="Filename to save ECDFs.",
 )
-parser.add_argument(
-    "--ecfps_radius",
-    type=int,
-    default=3,
-    help="Radius used for creating ECFPs.",
-)
-parser.add_argument(
-    "--ecfps_folds",
-    type=int,
-    default=8192,
-    help="Folds used for creating ECFPs.",
-)
-parser.add_argument(
-    "--ecdfs",
-    type=int,
-    default=1,
-    help="True (=1) to use ECDFs for creating quantiles of the RDKit descriptors.",
-)
-def main(args):
-    """Preprocessing train/val data to use for TabPFN.
-    1. Download Tox21 train/val data from HF
-    2. Preprocess dataset splits
-    """
-    ds = get_tox21_split(HF_TOKEN, cvfold=args.cv_fold)
-    feature_creation_kwargs = {
-        "radius": args.ecfps_radius,
-        "fpsize": args.ecfps_folds,
-        "min_var": args.min_var,
-        "max_corr": args.max_corr,
-    }
-    removed_mols = 0
-    splits = ["train", "validation", "test"]
     for split in splits:
         print(f"Preprocess {split} molecules")
-        if split != "test":
-            ds_split = ds[split]
-            smiles = list(ds_split["smiles"])
-        else:
-            import pandas as pd
-            ds_split = pd.read_csv("data/tox21_test_cv4.csv")
-            smiles = ds_split["smiles"]
-        features, clean_mol_mask = create_descriptors(smiles, **feature_creation_kwargs)
-        # if split == "train":
-        #     output = create_descriptors(
-        #         smiles,
-        #         return_feature_selection=True,
-        #         return_ecdfs=True,
-        #         **feature_creation_kwargs,
-        #     )
-        #     features = output.pop("features")
-        #     if args.feature_selection:
-        #         feature_selection = output.pop("feature_selection")
-        #         np.savez(
-        #             args.feature_selection_path,
-        #             ecfps_selec=feature_selection["ecfps_selec"],
-        #             tox_selec=feature_selection["tox_selec"],
-        #         )
-        #         print(f"Saved feature selection under {args.feature_selection_path}")
-        #     if args.ecdfs:
-        #         ecdfs = output.pop("ecdfs")
-        #         write_pickle(args.ecdfs_path, ecdfs)
-        #         print(f"Saved ECDFs under {args.ecdfs_path}")
-        # else:
-        #     features = create_descriptors(
-        #         smiles,
-        #         ecdfs=ecdfs,
-        #         feature_selection=feature_selection,
-        #         **feature_creation_kwargs,
-        #     )["features"]
-        removed_mols += (~clean_mol_mask).sum()
         labels = []
         for task in TASKS:
             labels.append(ds_split[task].to_numpy())
         labels = np.stack(labels, axis=1)
-        save_path = os.path.join(args.save_folder, f"tox21_{split}_cv4.npz")
         with open(save_path, "wb") as f:
             np.savez(
                 f,
-                labels=labels[clean_mol_mask, :],
-                features=features,
-                # **features,
             )
             print(f"Saved preprocessed {split} split under {save_path}")
-    print(f"{removed_mols} mols were removed during cleaning across all datasets")
     print("Preprocessing finished successfully")
 if __name__ == "__main__":
     args = parser.parse_args()
-    # args.ecdfs_path = os.path.join(args.save_folder, args.ecdfs_path)
-    # args.feature_selection_path = os.path.join(
-    #     args.save_folder, args.feature_selection_path
-    # )
-    create_dir(args.save_folder)
-    # create_dir(args.ecdfs_path, is_file=True)
-    # create_dir(args.feature_selection_path, is_file=True)
-    main(args)

 """
 import os
+import json
 import argparse
 import numpy as np
 from src.preprocess import create_descriptors, get_tox21_split
+from src.utils import TASKS, HF_TOKEN, create_dir, normalize_config
 parser = argparse.ArgumentParser(
     description="Data preprocessing script for the Tox21 dataset"
 )
+# Single CLI option: path of the JSON configuration file that the __main__
+# section loads and passes to main().
 parser.add_argument(
+    "--config",
     type=str,
+    default="config/config.json",
 )
+def main(config):
+    """Create molecule descriptors for HF Tox21 dataset"""
+    ds = get_tox21_split(HF_TOKEN, cvfold=config["cvfold"])
+    splits = ["train", "validation"]
     for split in splits:
         print(f"Preprocess {split} molecules")
+        ds_split = ds[split]
+        smiles = list(ds_split["smiles"])
+        features, clean_mol_mask = create_descriptors(
+            smiles, config["descriptors"], **config["ecfp"]
+        )
         labels = []
         for task in TASKS:
             labels.append(ds_split[task].to_numpy())
         labels = np.stack(labels, axis=1)
+        save_path = os.path.join(config["data_folder"], f"tox21_{split}_cv4.npz")
         with open(save_path, "wb") as f:
             np.savez(
                 f,
+                clean_mol_mask=clean_mol_mask,
+                labels=labels,
+                **features,
             )
             print(f"Saved preprocessed {split} split under {save_path}")
     print("Preprocessing finished successfully")
 if __name__ == "__main__":
     args = parser.parse_args()
+    # Load the JSON config named by --config and pass it through normalize_config.
+    with open(args.config, "r") as f:
+        config = json.load(f)
+    config = normalize_config(config)
+    # presumably creates the output folder if it is missing - verify in src/utils.py
+    create_dir(config["data_folder"])
+    main(config)

src/model.py CHANGED Viewed

@@ -6,15 +6,9 @@ SMILES and target names as keys.
 # ---------------------------------------------------------------------------------------
 # Dependencies
-import os
-import joblib
 import numpy as np
 from sklearn.ensemble import RandomForestClassifier
-from sklearn.preprocessing import StandardScaler
-from .preprocess import get_feature_selection, get_ecdfs, create_quantiles
 from .utils import TASKS
@@ -22,100 +16,34 @@ from .utils import TASKS
 class Tox21RFClassifier:
     """A random forest classifier that assigns a toxicity score to a given SMILES string."""
-    def __init__(
-        self, seed: int = 42, task_config: dict = None, rdkit_desc_idxs: list[int] = []
-    ):
         """Initialize a random forest classifier for each of the 12 Tox21 tasks.
         Args:
             seed (int, optional): seed for RF to ensure reproducibility. Defaults to 42.
         """
         self.tasks = TASKS
-        self.rdkit_desc_idxs = rdkit_desc_idxs
         self.models = {
             task: RandomForestClassifier(
                 random_state=seed,
                 n_jobs=8,
-                **(
-                    {"n_estimators": 1000} if task_config is None else task_config[task]
-                ),
             )
             for task in self.tasks
         }
-        self.feature_selection = None
-        self.ecdfs = None
-        self.scaler = StandardScaler()
-    def load_model(self, path: str) -> None:
-        """Loads the model from a given path
-        Args:
-            path (str): path to model checkpoint
-        """
-        model = joblib.load(path)
-        self.models = model["models"]
-        self.scaler = model["scalers"]
-        self.rdkit_desc_idxs = model["rdkit_desc_idxs"]
-        self.feature_selection = model["feature_selections"]
-        self.ecdfs = model["ecdfs"]
-    def save_model(self, path: str) -> None:
-        """Saves the model to a given path
         Args:
-            path (str): path to save model to
         """
-        if not os.path.exists(os.path.dirname(path)):
-            os.makedirs(os.path.dirname(path))
-        model = {
-            "models": self.models,
-            "feature_selections": self.feature_selection,
-            "ecdfs": self.ecdfs,
-            "scalers": self.scaler,
-            "rdkit_desc_idxs": self.rdkit_desc_idxs,
-        }
-        joblib.dump(model, path)
-    def fit_preprocessing(self, X: np.ndarray, min_var=0.01, max_corr=0.95) -> None:
-        X_ = X.copy()
-        _, n_feat = X.shape
-        if self.rdkit_desc_idxs is None:
-            self.rdkit_desc_idxs = np.arange(n_feat)
-        else:
-            assert (
-                self.rdkit_desc_idxs < n_feat
-            ).all(), "passed to_adapt list contains more features than in X!"
-        self.ecdfs = get_ecdfs(X_[:, self.rdkit_desc_idxs])
-        X_[:, self.rdkit_desc_idxs] = create_quantiles(
-            X_[:, self.rdkit_desc_idxs], self.ecdfs
-        )
-        # get feature selection
-        self.feature_selection = get_feature_selection(
-            X_, min_var=min_var, max_corr=max_corr
-        )
-        X_ = X_[:, self.feature_selection]
-        # fit scaler
-        X_ = self.scaler.fit(X_)
-    def _preprocess(self, X: np.ndarray) -> None:
-        X_ = X.copy()
-        X_[:, self.rdkit_desc_idxs] = create_quantiles(
-            X_[:, self.rdkit_desc_idxs], self.ecdfs
-        )
-        X_ = X_[:, self.feature_selection]
-        X_ = self.scaler.transform(X_)
-        return X_
     def fit(self, task: str, X: np.ndarray, y: np.ndarray) -> None:
         """Train the random forest for a given task
@@ -126,9 +54,8 @@ class Tox21RFClassifier:
             y (np.ndarray): training labels
         """
         assert task in self.tasks, f"Unknown task: {task}"
-        X_ = self._preprocess(X)
-        self.models[task].fit(X_, y)
     def predict(self, task: str, X: np.ndarray) -> np.ndarray:
         """Predicts labels for a given Tox21 target using molecule features
@@ -144,6 +71,5 @@ class Tox21RFClassifier:
         assert (
             len(X.shape) == 2
         ), f"Function expects 2D np.array. Current shape: {X.shape}"
-        X_ = self._preprocess(X)
-        return self.models[task].predict_proba(X_)[:, 1]

 # ---------------------------------------------------------------------------------------
 # Dependencies
 import numpy as np
 from sklearn.ensemble import RandomForestClassifier
 from .utils import TASKS
 class Tox21RFClassifier:
     """A random forest classifier that assigns a toxicity score to a given SMILES string."""
+    def __init__(self, seed: int = 42, config: dict | None = None):
         """Initialize a random forest classifier for each of the 12 Tox21 tasks.
         Args:
             seed (int, optional): seed for RF to ensure reproducibility. Defaults to 42.
+            config (dict | None, optional): maps each task name to the keyword
+                arguments forwarded to that task's RandomForestClassifier. If None,
+                every task uses {"n_estimators": 1000}. Defaults to None.
         """
         self.tasks = TASKS
+        # One independent forest per task; all share the same seed and n_jobs.
         self.models = {
             task: RandomForestClassifier(
                 random_state=seed,
                 n_jobs=8,
+                **({"n_estimators": 1000} if config is None else config[task]),
             )
             for task in self.tasks
         }
+    def set_state(self, state: dict) -> None:
+        """Sets the state of the model
+        The given mapping is stored as-is (no copy, no validation) and replaces
+        `self.models`. Note that this is the mapping found under the "models" key
+        of the dict returned by `get_state`, not that outer dict itself.
         Args:
+            state (dict): models state dict, indexed by task name in `fit`/`predict`
         """
+        self.models = state
+    def get_state(self) -> dict:
+        """Return model state dict
+        Returns:
+            dict: {"models": self.models}; the inner mapping is the live object,
+            not a copy.
+        """
+        return {"models": self.models}
     def fit(self, task: str, X: np.ndarray, y: np.ndarray) -> None:
         """Train the random forest for a given task
             y (np.ndarray): training labels
         """
         assert task in self.tasks, f"Unknown task: {task}"
+        _X, _y = X.copy(), y.copy()
+        self.models[task].fit(_X, _y)
     def predict(self, task: str, X: np.ndarray) -> np.ndarray:
         """Predicts labels for a given Tox21 target using molecule features
         assert (
             len(X.shape) == 2
         ), f"Function expects 2D np.array. Current shape: {X.shape}"
+        _X = X.copy()
+        return self.models[task].predict_proba(_X)[:, 1]

src/preprocess.py CHANGED Viewed

@@ -6,20 +6,304 @@ As an input it takes a list of SMILES and it outputs a nested dictionary with
 SMILES and target names as keys.
 """
 import json
 import numpy as np
 import pandas as pd
 from datasets import load_dataset
 from sklearn.feature_selection import VarianceThreshold
 from statsmodels.distributions.empirical_distribution import ECDF
 from rdkit import Chem, DataStructs
 from rdkit.Chem import Descriptors, rdFingerprintGenerator, MACCSkeys
 from rdkit.Chem.rdchem import Mol
-from .utils import USED_200_DESCR, TOX_SMARTS_PATH, Standardizer
 def create_cleaned_mol_objects(smiles: list[str]) -> tuple[list[Mol], np.ndarray]:
@@ -198,112 +482,59 @@ def fill(features, mask, value=np.nan):
 def create_descriptors(
     smiles,
-    ecdfs=None,
-    feature_selection=None,
-    return_ecdfs=False,
-    return_feature_selection=False,
-    **kwargs,
 ):
     # Create cleaned rdkit mol objects
     mols, clean_mol_mask = create_cleaned_mol_objects(smiles)
-    print("Cleaned molecules")
-    tox_patterns = get_tox_patterns(TOX_SMARTS_PATH)
     # Create fingerprints and descriptors
-    ecfps = create_ecfp_fps(mols, **kwargs)
-    # expand using mol_mask
-    # ecfps = fill(ecfps, ~clean_mol_mask)
-    print("Created ECFP fingerprints")
-    # print("ecfps features:", ecfps.shape)
-    tox = create_tox_features(mols, tox_patterns)
-    # tox = fill(tox, ~clean_mol_mask)
-    print("Created Tox features")
-    # print("tox features:", tox.shape)
-    # Create and save feature selection for ecfps and tox
-    # if feature_selection is None:
-    #     print("Create Feature selection")
-    #     ecfps_selec = get_feature_selection(ecfps, **kwargs)
-    #     tox_selec = get_feature_selection(tox, **kwargs)
-    #     feature_selection = {"ecfps_selec": ecfps_selec, "tox_selec": tox_selec}
-    # else:
-    #     ecfps_selec = feature_selection["ecfps_selec"]
-    #     tox_selec = feature_selection["tox_selec"]
-    # ecfps = ecfps[:, ecfps_selec]
-    # tox = tox[:, tox_selec]
-    maccs = create_maccs_keys(mols)
-    # maccs = fill(maccs, ~clean_mol_mask)
-    print("Created MACCS keys")
-    rdkit_descrs = create_rdkit_descriptors(mols)
-    # rdkit_descrs = fill(rdkit_descrs, ~clean_mol_mask)
-    print("Created RDKit descriptors")
-    # # Create and save ecdfs
-    # if ecdfs is None:
-    #     print("Create ECDFs")
-    #     ecdfs = []
-    #     for column in range(rdkit_descrs.shape[1]):
-    #         raw_values = rdkit_descrs[:, column].reshape(-1)
-    #         ecdfs.append(ECDF(raw_values))
-    # # Create quantiles
-    # rdkit_descr_quantiles = create_quantiles(rdkit_descrs, ecdfs)
-    # # expand using mol_mask
-    # rdkit_descr_quantiles = fill(rdkit_descr_quantiles, ~clean_mol_mask)
-    # print("Created quantiles of RDKit descriptors")
     # concatenate features
-    # features = {
-    #     "ecfps": ecfps,
-    #     "tox": tox,
-    #     "maccs": maccs,
-    #     "rdkit_descr_quantiles": rdkit_descr_quantiles,
-    # }
-    # for feat in [ecfps, tox, maccs, rdkit_descrs]:
-    #     print(feat.shape)
-    features = np.concat((ecfps, tox, maccs, rdkit_descrs), axis=1)
-    # return_dict = {"features": features}
-    # if return_ecdfs:
-    #     return_dict["ecdfs"] = ecdfs
-    # if return_feature_selection:
-    #     return_dict["feature_selection"] = feature_selection
-    return features, clean_mol_mask
-def get_ecdfs(raw_features: np.ndarray, **kwargs) -> np.ndarray:
-    ecdfs = []
-    for column in range(raw_features.shape[1]):
-        raw_values = raw_features[:, column].reshape(-1)
-        ecdfs.append(ECDF(raw_values))
-    return ecdfs
-def get_feature_selection(
-    raw_features: np.ndarray, min_var=0.01, max_corr=0.95, **kwargs
-) -> np.ndarray:
-    # select features with at least min_var variation
-    var_thresh = VarianceThreshold(threshold=min_var)
-    feature_selection = var_thresh.fit(raw_features).get_support(indices=True)
-    n_features_preselected = len(feature_selection)
-    # Remove highly correlated features
-    corr_matrix = np.corrcoef(raw_features[:, feature_selection], rowvar=False)
-    upper_tri = np.triu(corr_matrix, k=1)
-    to_keep = np.ones((n_features_preselected,), dtype=bool)
-    for i in range(upper_tri.shape[0]):
-        for j in range(upper_tri.shape[1]):
-            if upper_tri[i, j] > max_corr:
-                to_keep[j] = False
-    feature_selection = feature_selection[to_keep]
-    return feature_selection
 def get_tox21_split(token, cvfold=None):

 SMILES and target names as keys.
 """
+import copy
 import json
+from typing import Any
 import numpy as np
 import pandas as pd
 from datasets import load_dataset
+from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.feature_selection import VarianceThreshold
+from sklearn.preprocessing import StandardScaler, FunctionTransformer
 from statsmodels.distributions.empirical_distribution import ECDF
 from rdkit import Chem, DataStructs
 from rdkit.Chem import Descriptors, rdFingerprintGenerator, MACCSkeys
 from rdkit.Chem.rdchem import Mol
+from .utils import USED_200_DESCR, TOX_SMARTS_PATH, Standardizer, FeatureDictMixin
+class SquashScaler(TransformerMixin, BaseEstimator):
+    """
+    Scaler that performs sequential standardization, nonlinearity (tanh), and
+    re-standardization. Inspired by DeepTox (Mayr et al., 2016)
+    """
+    def __init__(self):
+        self.scaler1 = StandardScaler()
+        self.scaler2 = StandardScaler()
+    def fit(self, X):
+        _X = X.copy()
+        _X = self.scaler1.fit_transform(_X)
+        _X = np.tanh(_X)
+        _X = self.scaler2.fit(_X)
+        return self
+    def transform(self, X):
+        _X = X.copy()
+        _X = self.scaler1.transform(_X)
+        _X = np.tanh(_X)
+        return self.scaler2.transform(_X)
+# Maps the `scaler` name given to FeaturePreprocessor to a scaler class; the
+# selected class is instantiated there without arguments.
+# NOTE(review): "none" presumably relies on an argument-less FunctionTransformer
+# passing its input through unchanged - confirm against the scikit-learn docs.
+SCALER_REGISTRY = {
+    "none": FunctionTransformer,
+    "standard": StandardScaler,
+    "squash": SquashScaler,
+}
+class SubSampler(TransformerMixin, BaseEstimator):
+    """
+    Preprocessor that randomly samples `max_samples` from data.
+    Args:
+        max_samples (int): Maximum allowed samples. If -1, all samples are retained.
+    Input:
+        np.ndarray: A 2D NumPy array of shape (n_samples, n_features).
+    Output:
+        np.ndarray: Subsampled array of shape (min(n_samples, max_samples), n_features).
+    """
+    def __init__(self, *, max_samples=-1):
+        self.max_samples = max_samples
+        self.is_fitted_ = True
+    def fit(self, X: np.ndarray, y: np.ndarray | None = None):
+        return self
+    def transform(
+        self, X: np.ndarray, y: np.ndarray | None = None
+    ) -> np.ndarray | tuple[np.ndarray]:
+        _X = X.copy()
+        _y = y.copy() if y is not None else None
+        if self.max_samples > 0:
+            resample_idxs = np.random.choice(
+                np.arange(_X.shape[0]), size=(self.max_samples,), replace=True
+            )
+            _X = _X[resample_idxs]
+            _y = _y[resample_idxs] if _y is not None else None
+        if _y is None:
+            return _X
+        return _X, _y
+class FeatureSelector(FeatureDictMixin, TransformerMixin, BaseEstimator):
+    """
+    Preprocessor that performs feature selection based on variance and correlation.
+    This transformer selects features that:
+    1. Have variance above a specified threshold.
+    2. Are below a given pairwise correlation threshold.
+    3. Among the remaining features, keeps only the top `max_features` with the highest variance.
+    The input and output are both dictionaries mapping feature types to their corresponding
+    feature matrices.
+    Args:
+        min_var (float): Minimum variance required for a feature to be retained.
+        max_corr (float): Maximum allowed correlation between features.
+            Features exceeding this threshold with others are removed.
+        max_features (int): Maximum number of features to keep after filtering.
+            If -1, all remaining features are retained.
+    Input:
+        dict[str, np.ndarray]: A dictionary where each key corresponds to a feature type
+        and each value is a 2D NumPy array of shape (n_samples, n_features).
+    Output:
+        dict[str, np.ndarray]: A dictionary with the same keys as the input,
+        containing only the selected features for each feature type.
+    """
+    def __init__(
+        self, *, min_var=0.0, max_corr=1.0, max_features=-1, feature_keys=None
+    ):
+        self.min_var = min_var
+        self.max_corr = max_corr
+        self.max_features = max_features
+        self._feature_mask = None
+        super().__init__(feature_keys=feature_keys)
+    def fit(self, X: dict[str, np.ndarray]):
+        _X = self.dict_to_array(X)
+        # select features with at least min_var variation
+        if self.min_var > 0.0:
+            var_thresh = VarianceThreshold(threshold=self.min_var)
+            feature_mask = var_thresh.fit(_X).get_support()  # mask
+        # select features with at least max_var variation
+        if self.max_corr < 1.0:
+            corr_matrix = np.corrcoef(_X[:, feature_mask], rowvar=False)
+            upper_tri = np.triu(corr_matrix, k=1)
+            to_keep = np.ones((sum(feature_mask),), dtype=bool)
+            for i in range(upper_tri.shape[0]):
+                for j in range(upper_tri.shape[1]):
+                    if upper_tri[i, j] > self.max_corr:
+                        to_keep[j] = False
+            feature_mask[feature_mask] = to_keep
+        if self.max_features == 0:
+            raise ValueError(
+                f"max_features (={self.max_features}) must be -1 or larger 0."
+            )
+        elif self.max_features > 0:
+            # select features with at least max_var variation
+            feature_vars = np.nanvar(_X[:, feature_mask], axis=0)
+            order = np.argsort(feature_vars)[: -(self.max_features + 1) : -1]
+            keep_feat_idx = np.arange(feature_mask)[order]
+            feature_mask = np.isin(
+                np.arange(feature_mask), keep_feat_idx, assume_unique=True
+            )
+        self._feature_mask = feature_mask
+        self.is_fitted_ = True
+        return self
+    def transform(self, X: dict[str, np.ndarray]) -> dict[str, np.ndarray]:
+        _X = self.dict_to_array(X)
+        _X = _X[:, self._feature_mask]
+        self._curr_keys = self._curr_keys[self._feature_mask]
+        return self.array_to_dict(_X)
+class QuantileCreator(FeatureDictMixin, TransformerMixin, BaseEstimator):
+    """
+    Preprocessor that transforms features into empirical quantiles using ECDFs.
+    This transformer applies an Empirical Cumulative Distribution Function (ECDF)
+    to each feature and replaces feature values with their corresponding quantile
+    ranks. The transformation is applied independently to each feature type.
+    Both input and output are dictionaries mapping feature types to their
+    corresponding feature matrices.
+    Input:
+        dict[str, np.ndarray]: A dictionary where each key corresponds to a feature type
+        and each value is a 2D NumPy array of shape (n_samples, n_features).
+    Output:
+        dict[str, np.ndarray]: A dictionary with the same keys as the input,
+        where each feature value is replaced by its corresponding ECDF quantile rank.
+    """
+    def __init__(self, *, feature_keys=None):
+        self._ecdfs = None
+        super().__init__(feature_keys=feature_keys)
+    def fit(self, X: dict[str, np.ndarray]):
+        _X = self.dict_to_array(X)
+        ecdfs = []
+        for column in range(_X.shape[1]):
+            raw_values = _X[:, column].reshape(-1)
+            ecdfs.append(ECDF(raw_values))
+        self._ecdfs = ecdfs
+        self.is_fitted_ = True
+        return self
+    def transform(self, X: dict[str, np.ndarray]) -> np.ndarray:
+        _X = self.dict_to_array(X)
+        quantiles = np.zeros_like(_X)
+        for column in range(_X.shape[1]):
+            raw_values = _X[:, column].reshape(-1)
+            ecdf = self._ecdfs[column]
+            q = ecdf(raw_values)
+            quantiles[:, column] = q
+        return self.array_to_dict(quantiles)
+class FeaturePreprocessor(TransformerMixin, BaseEstimator):
+    """This class implements the feature preprocessing from a dictionary of molecule features."""
+    def __init__(
+        self,
+        feature_selection_config: dict[str, Any],
+        feature_quantilization_config: dict[str, Any],
+        descriptors: list[str],
+        max_samples: int = -1,
+        scaler: str = "standard",
+    ):
+        self.descriptors = descriptors
+        self.feature_quantilization_config = feature_quantilization_config
+        self.use_feat_quant = self.feature_quantilization_config.pop("use")
+        self.quantile_creator = QuantileCreator(**feature_quantilization_config)
+        self.feature_selection_config = feature_selection_config
+        self.use_feat_selec = self.feature_selection_config.pop("use")
+        self.feature_selector = FeatureSelector(**feature_selection_config)
+        self.max_samples = max_samples
+        self.sub_sampler = SubSampler(max_samples=max_samples)
+        self.scaler = SCALER_REGISTRY[scaler]()
+    def __getstate__(self):
+        state = super().__getstate__()
+        state["quantile_creator"] = self.quantile_creator.__getstate__()
+        state["feature_selector"] = self.feature_selector.__getstate__()
+        state["sub_sampler"] = self.sub_sampler.__getstate__()
+        state["scaler"] = self.scaler.__getstate__()
+        return state
+    def __setstate__(self, state):
+        _state = copy.deepcopy(state)
+        self.quantile_creator.__setstate__(_state.pop("quantile_creator"))
+        self.feature_selector.__setstate__(_state.pop("feature_selector"))
+        self.sub_sampler.__setstate__(_state.pop("sub_sampler"))
+        self.scaler.__setstate__(_state.pop("scaler"))
+        super().__setstate__(_state)
+    def fit(self, X: dict[str, np.ndarray]):
+        """Fit the processor transformers"""
+        _X = copy.deepcopy(X)
+        if self.use_feat_quant:
+            _X = self.quantile_creator.fit_transform(_X)
+        if self.use_feat_selec:
+            _X = self.feature_selector.fit_transform(_X)
+        _X = np.concatenate([_X[descr] for descr in self.descriptors], axis=1)
+        self.scaler.fit(_X)
+        return self
+    def transform(
+        self, X: np.ndarray, y: np.ndarray | None = None
+    ) -> np.ndarray | tuple[np.ndarray]:
+        _X = X.copy()
+        _y = y.copy() if y is not None else None
+        if self.use_feat_quant:
+            _X = self.quantile_creator.transform(_X)
+        if self.use_feat_selec:
+            _X = self.feature_selector.transform(_X)
+        _X = np.concatenate([_X[descr] for descr in self.descriptors], axis=1)
+        _X = self.scaler.transform(_X)
+        if _y is None:
+            _X = self.sub_sampler.transform(_X)
+            return _X
+        _X, _y = self.sub_sampler.transform(_X, _y)
+        return _X, _y
 def create_cleaned_mol_objects(smiles: list[str]) -> tuple[list[Mol], np.ndarray]:
 def create_descriptors(
     smiles,
+    descriptors,
+    **ecfp_kwargs,
 ):
+    """Generate molecular descriptors for multiple SMILES strings.
+    Each SMILES is processed and sanitized using RDKit.
+    SMILES that cannot be sanitized are encoded with NaNs, and a corresponding boolean mask
+    is returned to indicate which inputs were successfully processed.
+    Args:
+        smiles (list[str]): List of SMILES strings for which to generate descriptors.
+        descriptors (list[str]): List of descriptor types to compute.
+            Supported values include:
+            ['ecfps', 'tox', 'maccs', 'rdkit_descrs'].
+    Returns:
+        tuple[dict[str, np.ndarray], np.ndarray]:
+            - A dictionary mapping descriptor names to their computed arrays.
+            - A boolean mask of shape (len(smiles),) indicating which SMILES
+            were successfully sanitized and processed.
+    """
     # Create cleanded rdkit mol objects
     mols, clean_mol_mask = create_cleaned_mol_objects(smiles)
+    print(f"Cleaned molecules, {(~clean_mol_mask).sum()} could not be sanitized")
     # Create fingerprints and descriptors
+    if "ecfps" in descriptors:
+        ecfps = create_ecfp_fps(mols, **ecfp_kwargs)
+        ecfps = fill(ecfps, ~clean_mol_mask)
+        print("Created ECFP fingerprints")
+    if "tox" in descriptors:
+        tox_patterns = get_tox_patterns(TOX_SMARTS_PATH)
+        tox = create_tox_features(mols, tox_patterns)
+        tox = fill(tox, ~clean_mol_mask)
+        print("Created Tox features")
+    if "maccs" in descriptors:
+        maccs = create_maccs_keys(mols)
+        maccs = fill(maccs, ~clean_mol_mask)
+        print("Created MACCS keys")
+    if "rdkit_descrs" in descriptors:
+        rdkit_descrs = create_rdkit_descriptors(mols)
+        rdkit_descrs = fill(rdkit_descrs, ~clean_mol_mask)
+        print("Created RDKit descriptors")
     # concatenate features
+    features = {}
+    for descr in descriptors:
+        features[descr] = vars()[descr]
+    return features, clean_mol_mask
 def get_tox21_split(token, cvfold=None):

src/utils.py CHANGED Viewed

@@ -7,6 +7,9 @@
 import os
 import pickle
 from rdkit import Chem
 from rdkit.Chem.MolStandardize import rdMolStandardize
@@ -29,7 +32,7 @@ TASKS = [
     "SR-p53",
 ]
-KNOWN_DESCR = ["ecfps", "rdkit_descr_quantiles", "maccs", "tox"]
 USED_200_DESCR = [
     0,
@@ -433,6 +436,63 @@ class Standardizer:
         return mol_out, n_tautomers
 def load_pickle(path: str):
     with open(path, "rb") as file:
         content = pickle.load(file)
@@ -459,7 +519,7 @@ def normalize_config(config: dict):
     for key, val in config.items():
         if isinstance(val, dict):
             new_config[key] = normalize_config(val)
-        elif val in mapping:
             new_config[key] = mapping[val]
         else:
             new_config[key] = val

 import os
 import pickle
+from typing import Any
+import numpy as np
 from rdkit import Chem
 from rdkit.Chem.MolStandardize import rdMolStandardize
     "SR-p53",
 ]
+KNOWN_DESCR = ["ecfps", "tox", "maccs", "rdkit_descrs"]
 USED_200_DESCR = [
     0,
         return mol_out, n_tautomers
class FeatureDictMixin:
    """Mixin for converting dict-based multi-feature inputs to arrays and back.

    ``dict_to_array`` concatenates the arrays stored under ``feature_keys``
    column-wise (in ``feature_keys`` order) and records, per column, the key
    it came from. Entries under any other key are set aside.
    ``array_to_dict`` uses the recorded per-column keys to split an array back
    into a dict and re-attaches the entries that were set aside.

    ``array_to_dict`` selects columns with a boolean mask built from the
    recorded keys, so the array passed to it must have exactly one column per
    recorded entry.

    Example input:
        {
            "ecfps": np.ndarray,
            "tox": np.ndarray,
        }
    """

    def __init__(self, feature_keys=None):
        """Store the keys to combine and reset the conversion state.

        Args:
            feature_keys: Keys of the input dict whose arrays are concatenated.
                Must be a collection of keys by the time ``dict_to_array`` is
                called.
        """
        self.feature_keys = feature_keys
        self._curr_keys = None
        self._unused_data = None

    def dict_to_array(self, input: dict[Any, np.ndarray]) -> np.ndarray:
        """Concatenate the arrays of all feature keys into a single 2D array.

        Args:
            input: Dict mapping feature names to 2D arrays. Entries whose key
                is not in ``feature_keys`` are stored and re-attached by
                ``array_to_dict``.

        Returns:
            The arrays under ``feature_keys`` concatenated along axis 1, in
            ``feature_keys`` order.

        Raises:
            TypeError: If ``input`` is not a dict.
            KeyError: If a feature key is missing from ``input``.
            ValueError: If a feature array is not 2D.
        """
        if not isinstance(input, dict):
            raise TypeError("Input must be a dict {feature_type: np.ndarray, ...}")

        selected = {}
        unused = {}
        for key, value in input.items():
            if key in self.feature_keys:
                selected[key] = value
            else:
                unused[key] = value

        column_keys = []
        arrays = []
        for key in self.feature_keys:
            try:
                array = selected.pop(key)
            except KeyError:
                raise KeyError(
                    f"Feature '{key}' is missing from input; "
                    f"available keys: {list(input)}"
                ) from None
            if array.ndim != 2:
                raise ValueError(f"Feature '{key}' must be 2D, got shape {array.shape}")
            # One entry per column so columns can be mapped back to their key.
            column_keys.extend([key] * array.shape[1])
            arrays.append(array)

        combined = np.concatenate(arrays, axis=1)

        # Commit state only after the whole input was validated and combined,
        # so a failed call never leaves a half-updated mapping behind.
        self._curr_keys = np.array(column_keys)
        self._unused_data = unused
        return combined

    def array_to_dict(self, input: np.ndarray) -> dict[Any, np.ndarray]:
        """Split a concatenated array back into a dict of per-feature arrays.

        The stored column mapping and the set-aside entries are consumed by
        this call; ``dict_to_array`` has to be called again before the next
        ``array_to_dict``.

        Args:
            input: 2D array with one column per entry of the stored mapping.

        Returns:
            Dict with one array per feature key plus the entries that
            ``dict_to_array`` set aside.

        Raises:
            ValueError: If no column mapping is stored.
        """
        if self._curr_keys is None:
            raise ValueError(
                "No feature mapping stored. Did you call dict_to_array()?"
            )
        output = {key: input[:, self._curr_keys == key] for key in self.feature_keys}
        output.update(self._unused_data)
        # Reset so that a stale mapping is never applied to a later array.
        self._curr_keys = None
        self._unused_data = None
        return output
 def load_pickle(path: str):
     with open(path, "rb") as file:
         content = pickle.load(file)
     for key, val in config.items():
         if isinstance(val, dict):
             new_config[key] = normalize_config(val)
+        elif isinstance(val, (int, float, str)) and val in mapping:
             new_config[key] = mapping[val]
         else:
             new_config[key] = val

train.py CHANGED Viewed

@@ -4,6 +4,7 @@ Script for fitting and saving any preprocessing assets, as well as the fitted RF
 import os
 import json
 import random
 import logging
 import argparse
@@ -12,11 +13,8 @@ import numpy as np
 from datetime import datetime
 from src.model import Tox21RFClassifier
-from src.utils import (
-    create_dir,
-    normalize_config,
-    USED_200_DESCR,
-)
 parser = argparse.ArgumentParser(description="RF Training script for Tox21 dataset")
@@ -27,7 +25,7 @@ parser.add_argument(
 )
-def main(cfg):
     timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
     # setup logger
@@ -39,7 +37,7 @@ def main(cfg):
         handlers=[
             logging.FileHandler(
                 os.path.join(
-                    cfg["log_folder"],
                     f"{script_name}_{timestamp}.log",
                 )
             ),
@@ -47,50 +45,50 @@ def main(cfg):
         ],
     )
-    task_configs = cfg.pop("task_configs")
-    logger.info(f"Config: {cfg}")
-    task_configs_repr = "Task configs: \n" + "\n".join(
-        [str(val) for key, val in task_configs.items()]
     )
-    logger.info(f"Task configs: \n{task_configs_repr}")
     # seeding
-    random.seed(cfg["seed"])
-    np.random.seed(cfg["seed"])
-    train_data = np.load(os.path.join(cfg["data_folder"], "tox21_train_cv4.npz"))
-    train_X = train_data[
-        "features"
-    ]  # np.concatenate([train_data[descr] for descr in KNOWN_DESCR], axis=1)
-    train_y = train_data["labels"]
-    val_data = np.load(os.path.join(cfg["data_folder"], "tox21_validation_cv4.npz"))
-    val_X = val_data[
-        "features"
-    ]  # np.concatenate([val_data[descr] for descr in KNOWN_DESCR], axis=1)
-    val_y = val_data["labels"]
-    data = np.concatenate([train_X, val_X], axis=0)
-    labels = np.concatenate([train_y, val_y], axis=0)
-    logger.info(f"Train data shape: {data.shape}")
-    if cfg["model_path"]:
         logger.info(
-            f"Fitted RandomForestClassifier will be saved as: {cfg['model_path']}"
         )
     else:
         logger.info("Fitted RandomForestClassifier will NOT be saved.")
-    rdkit_descr_idxs = np.arange(data.shape[1] - len(USED_200_DESCR), data.shape[1])
-    model = Tox21RFClassifier(
-        seed=cfg["seed"],
-        task_config=task_configs,
-        rdkit_desc_idxs=rdkit_descr_idxs,
-    )
-    model.fit_preprocessing(
-        data, min_var=cfg["feature_minvar"], max_corr=cfg["feature_maxcorr"]
     )
     logger.info("Start training.")
     for i, task in enumerate(model.tasks):
@@ -98,28 +96,34 @@ def main(cfg):
         label_mask = ~np.isnan(task_labels)
         logger.info(f"Fit task {task} using {sum(label_mask)} samples")
-        task_data = data[label_mask]
         task_labels = task_labels[label_mask].astype(int)
         model.fit(task, task_data, task_labels)
-        if cfg["debug"]:
             break
     log_text = f"Finished training."
     logger.info(log_text)
-    if cfg["model_path"]:
-        model.save_model(cfg["model_path"])
-        logger.info(f"Save model as: {cfg['model_path']}")
 if __name__ == "__main__":
     args = parser.parse_args()
     with open(args.config, "r") as f:
-        cfg = json.load(f)
-    cfg = normalize_config(cfg)
-    create_dir(cfg["log_folder"])
-    main(cfg)

 import os
 import json
+import joblib
 import random
 import logging
 import argparse
 from datetime import datetime
 from src.model import Tox21RFClassifier
+from src.preprocess import Tox21Preprocessor
from src.utils import create_dir, normalize_config
 parser = argparse.ArgumentParser(description="RF Training script for Tox21 dataset")
 )
+def main(config):
     timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
     # setup logger
         handlers=[
             logging.FileHandler(
                 os.path.join(
+                    config["log_folder"],
                     f"{script_name}_{timestamp}.log",
                 )
             ),
         ],
     )
+    logger.info(f"Config: {config}")
+    model_configs_repr = "Model configs: \n" + "\n".join(
+        [str(val) for val in config["model_configs"].values()]
     )
+    logger.info(f"Model configs: \n{model_configs_repr}")
     # seeding
+    random.seed(config["seed"])
+    np.random.seed(config["seed"])
+    train_data = np.load(os.path.join(config["data_folder"], "tox21_train_cv4.npz"))
+    val_data = np.load(os.path.join(config["data_folder"], "tox21_validation_cv4.npz"))
+    # filter out unsanitized molecules
+    train_is_clean = train_data["clean_mol_mask"]
+    val_is_clean = val_data["clean_mol_mask"]
+    train_data = {descr: array[train_is_clean] for descr, array in train_data.items()}
+    val_data = {descr: array[val_is_clean] for descr, array in val_data.items()}
+    # combine datasets
+    data = {
+        descr: np.concatenate([train_data[descr], val_data[descr]], axis=0)
+        for descr in config["descriptors"]
+    }
+    labels = np.concatenate([train_data["labels"], val_data["labels"]], axis=0)
+    if config["ckpt_path"]:
         logger.info(
+            f"Fitted RandomForestClassifier will be saved as: {config['ckpt_path']}"
         )
     else:
         logger.info("Fitted RandomForestClassifier will NOT be saved.")
+    model = Tox21RFClassifier(seed=config["seed"], config=config["model_configs"])
+    # setup processors
+    preprocessor = Tox21Preprocessor(
+        feature_selection_config=config["feature_selection"],
+        feature_quantilization_config=config["feature_quantilization"],
+        descriptors=config["descriptors"],
+        max_samples=config["max_samples"],
+        scaler=config["scaler"],
     )
+    preprocessor.fit(data)
     logger.info("Start training.")
     for i, task in enumerate(model.tasks):
         label_mask = ~np.isnan(task_labels)
         logger.info(f"Fit task {task} using {sum(label_mask)} samples")
+        task_data = {key: val[label_mask] for key, val in data.items()}
         task_labels = task_labels[label_mask].astype(int)
+        task_data = preprocessor.transform(task_data)
         model.fit(task, task_data, task_labels)
+        if config["debug"]:
             break
     log_text = f"Finished training."
     logger.info(log_text)
+    if config["ckpt_path"]:
+        ckpt = {
+            "preprocessor": preprocessor.__getstate__(),
+            "models": model.get_state(),
+        }
+        # model.save_model(config["ckpt_path"])
+        joblib.dump(ckpt, config["ckpt_path"])
+        logger.info(f"Save model as: {config['ckpt_path']}")
 if __name__ == "__main__":
     args = parser.parse_args()
     with open(args.config, "r") as f:
+        config = json.load(f)
+    config = normalize_config(config)
+    create_dir(config["log_folder"])
+    main(config)