Spaces:
Sleeping
Sleeping
File size: 2,230 Bytes
81226cb 117eac3 486af19 81226cb 593848b a8d912f 81226cb 75c7791 81226cb 593848b 81226cb 593848b 81226cb 593848b 81226cb 593848b 81226cb 593848b 81226cb 75c7791 593848b 81226cb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 |
"""
This file includes a predict function for Tox21.
It takes a list of SMILES as input and outputs a nested dictionary with
SMILES and target names as keys.
"""
# ---------------------------------------------------------------------------------------
# Dependencies
from collections import defaultdict
import numpy as np
from src.data import create_descriptors
from src.utils import load_pickle, KNOWN_DESCR
from src.model import Tox21RFClassifier
# ---------------------------------------------------------------------------------------
def predict(smiles_list: list[str]) -> dict[str, dict[str, float]]:
    """Apply the classifier to a list of SMILES strings.

    Loads the preprocessing assets (ECDFs and scaler) and the model from the
    ``assets/`` directory, computes descriptors for the given SMILES and
    collects one prediction per target in ``model.tasks``. Any molecule that
    could not be cleaned gets prediction=0.0 for every target.

    Args:
        smiles_list (list[str]): list of SMILES strings

    Returns:
        dict: nested prediction dictionary, following
            {'<smiles>': {'<target>': <pred>}}. The returned object is a
            ``defaultdict(dict)``. Duplicate SMILES strings collapse into a
            single key.
    """
    print(f"Received {len(smiles_list)} SMILES strings")

    predictions = defaultdict(dict)

    # Nothing to predict: skip loading the assets and the model altogether.
    # len() is used instead of truthiness so array-like inputs work as well.
    if len(smiles_list) == 0:
        return predictions

    # preprocessing pipeline
    ecdfs_path = "assets/ecdfs.pkl"
    scaler_path = "assets/scaler.pkl"
    ecdfs = load_pickle(ecdfs_path)
    scaler = load_pickle(scaler_path)
    print(f"Loaded ecdfs from {ecdfs_path}")
    print(f"Loaded scaler from {scaler_path}")

    descriptors = KNOWN_DESCR
    features, mol_mask = create_descriptors(
        smiles_list,
        ecdfs=ecdfs,
        scaler=scaler,
        descriptors=descriptors,
    )
    print(f"Created descriptors {descriptors} for molecules.")

    # mol_mask holds one truthy/falsy entry per input SMILES (truthy = kept).
    n_clean = int(sum(mol_mask))
    print(f"{len(mol_mask) - n_clean} molecules removed during cleaning")

    # setup model
    model = Tox21RFClassifier(seed=42)
    model_path = "assets/rf_alltasks.joblib"
    model.load_model(model_path)
    print(f"Loaded model from {model_path}")

    # make predictions
    # Map every position in smiles_list to the row of `features` that belongs
    # to it: the running count of clean molecules minus one. Entries for
    # removed molecules are meaningless (possibly -1) and are never used.
    feat_indices = np.cumsum(mol_mask) - 1

    for target in model.tasks:
        # NOTE(review): when no molecule survived cleaning, `features` has no
        # rows and the underlying model presumably rejects such input, so the
        # model call is skipped. Every molecule then takes the 0.0 branch
        # below and `target_pred` is never indexed.
        target_pred = model.predict(target, features) if n_clean > 0 else None
        for smiles, is_clean, i in zip(smiles_list, mol_mask, feat_indices):
            predictions[smiles][target] = float(target_pred[i]) if is_clean else 0.0

    return predictions
|