""" This files includes a predict function for the Tox21. As an input it takes a list of SMILES and it outputs a nested dictionary with SMILES and target names as keys. """ # --------------------------------------------------------------------------------------- # Dependencies import json import copy from collections import defaultdict import joblib import numpy as np from tqdm import tqdm from src.model import Tox21RFClassifier from src.preprocess import create_descriptors, FeaturePreprocessor from src.utils import TASKS, normalize_config # --------------------------------------------------------------------------------------- CONFIG_FILE = "./config/config.json" def predict( smiles_list: list[str], default_prediction: float = 0.5 ) -> dict[str, dict[str, float]]: """Applies the classifier to a list of SMILES strings. Returns prediction=0.0 for any molecule that could not be cleaned. Args: smiles_list (list[str]): list of SMILES strings Returns: dict: nested prediction dictionary, following {'': {'': }} """ print(f"Received {len(smiles_list)} SMILES strings") with open(CONFIG_FILE, "r") as f: config = json.load(f) config = normalize_config(config) features, is_clean = create_descriptors( smiles_list, config["descriptors"], **config["ecfp"] ) print(f"Created descriptors for {sum(is_clean)} molecules.") print(f"{len(is_clean) - sum(is_clean)} molecules removed during cleaning") # setup model model = Tox21RFClassifier() preprocessor = FeaturePreprocessor( feature_selection_config=config["feature_selection"], feature_quantilization_config=config["feature_quantilization"], descriptors=config["descriptors"], max_samples=config["max_samples"], scaler=config["scaler"], ) model.load(config["ckpt_path"]) print(f"Loaded model from {config['ckpt_path']}") state = joblib.load(config["preprocessor_path"]) preprocessor.set_state(state) print(f"Loaded preprocessor from {config['preprocessor_path']}") # make predicitons predictions = defaultdict(dict) print(f"Create predictions:") preds = [] for target in tqdm(TASKS): X = copy.deepcopy(features) X = {descr: array[is_clean] for descr, array in X.items()} X = preprocessor.transform(X) preds = np.empty_like(is_clean, dtype=np.float64) preds[~is_clean] = default_prediction preds[is_clean] = model.predict(target, X) for smiles, pred in zip(smiles_list, preds): predictions[smiles][target] = float(pred) if config["debug"]: break return predictions