bpmunson / polygon

POLYGON VAE For de novo Polypharmacology
MIT License
27 stars 8 forks source link

train_ligand_binding_model error. #8

Open yanbosmu opened 1 month ago

yanbosmu commented 1 month ago

Traceback (most recent call last):
  File "/home/yanbosmu/桌面/your_path/mypolygon/bin/polygon", line 8, in <module>
    sys.exit(main())
  File "/home/yanbosmu/桌面/your_path/mypolygon/lib/python3.9/site-packages/polygon/run.py", line 849, in main
    r = train_ligand_binding_model_main(args)
  File "/home/yanbosmu/桌面/your_path/mypolygon/lib/python3.9/site-packages/polygon/run.py", line 810, in train_ligand_binding_model_main
    train_ligand_binding_model( args.uniprot_id,
  File "/home/yanbosmu/桌面/your_path/mypolygon/lib/python3.9/site-packages/polygon/utils/train_ligand_binding_model.py", line 69, in train_ligand_binding_model
    regr.fit(X,y)
  File "/home/yanbosmu/桌面/your_path/mypolygon/lib/python3.9/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/yanbosmu/桌面/your_path/mypolygon/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 363, in fit
    X, y = self._validate_data(
  File "/home/yanbosmu/桌面/your_path/mypolygon/lib/python3.9/site-packages/sklearn/base.py", line 650, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/home/yanbosmu/桌面/your_path/mypolygon/lib/python3.9/site-packages/sklearn/utils/validation.py", line 1318, in check_X_y
    y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator)
  File "/home/yanbosmu/桌面/your_path/mypolygon/lib/python3.9/site-packages/sklearn/utils/validation.py", line 1328, in _check_y
    y = check_array(
  File "/home/yanbosmu/桌面/your_path/mypolygon/lib/python3.9/site-packages/sklearn/utils/validation.py", line 1064, in check_array
    _assert_all_finite(
  File "/home/yanbosmu/桌面/your_path/mypolygon/lib/python3.9/site-packages/sklearn/utils/validation.py", line 123, in _assert_all_finite
    _assert_all_finite_element_wise(
  File "/home/yanbosmu/桌面/your_path/mypolygon/lib/python3.9/site-packages/sklearn/utils/validation.py", line 172, in _assert_all_finite_element_wise
    raise ValueError(msg_err)
ValueError: Input y contains infinity or a value too large for dtype('float64')

I successfully ran train_ligand_binding_model for some UniProt IDs, such as Q9Y572 and Q13546, but I got errors for both of the examples in your tutorial (UniProt IDs Q02750 and P42345).

Any clues about this? Thank you!

yanbosmu commented 1 month ago

I rewrote train_ligand_binding_model.py

import pickle import pandas as pd import numpy as np

from rdkit import Chem from rdkit.Chem import AllChem

from sklearn.ensemble import RandomForestRegressor from sklearn.model_selection import train_test_split from sklearn.model_selection import KFold

from pathlib import Path

import logging

def _read_binding_db(binding_db_path):
    """Load the tab-separated BindingDB dump, skipping malformed lines with a warning."""
    read_kwargs = dict(sep="\t", header=0, low_memory=False)
    try:
        # pandas >= 1.3 spelling; `error_bad_lines` was removed in pandas 2.0.
        return pd.read_csv(binding_db_path, on_bad_lines='warn', **read_kwargs)
    except TypeError:
        # Older pandas does not know `on_bad_lines`; fall back to the legacy flag.
        return pd.read_csv(binding_db_path, error_bad_lines=False, **read_kwargs)


def _parse_affinity_nm(raw):
    """Convert one affinity cell (number, or text such as '>10000') to a float, else NaN."""
    try:
        return float(raw)
    except (TypeError, ValueError):
        pass
    try:
        # Censored values carry a leading qualifier ('>', '<', '>=', ...); drop it.
        return float(str(raw).strip().lstrip('<>=~ '))
    except ValueError:
        return np.nan


def _affinity_to_metric(ic50, kd):
    """Return -log10(min(ic50, kd) * 1e-9), or NaN when no usable positive finite value exists."""
    measured = [
        value
        for value in (_parse_affinity_nm(ic50), _parse_affinity_nm(kd))
        if not np.isnan(value)
    ]
    if not measured:
        return np.nan
    strongest = min(measured)
    # log10 of zero/negative values or of infinity would put inf/NaN into the targets.
    if strongest <= 0 or np.isinf(strongest):
        return np.nan
    return -np.log10(strongest * 1E-9)


def train_ligand_binding_model(target_unit_pro_id, binding_db_path, output_path):
    """Fit a random-forest regressor on BindingDB affinities for one target and pickle it.

    Rows whose 'UniProt (SwissProt) Primary ID of Target Chain' equals
    ``target_unit_pro_id`` are selected; for each ligand the smaller of the
    'IC50 (nM)' and 'Kd (nM)' values is converted to ``-log10(value * 1e-9)``.
    Ligands are featurised as Morgan bit-vector fingerprints (radius 2) and a
    ``RandomForestRegressor`` is trained on them.

    Args:
        target_unit_pro_id: Identifier matched against the UniProt ID column.
        binding_db_path: Path to the tab-separated BindingDB file.
        output_path: Destination of the pickled model. When ``None``,
            ``'<target_unit_pro_id>_rfr_ligand_model.pt'`` is used.

    Returns:
        Always ``1``, both when a model was written and when fitting was
        skipped because too few usable compound-target pairs were found.
    """
    binddb = _read_binding_db(binding_db_path)

    is_target = binddb['UniProt (SwissProt) Primary ID of Target Chain'] == target_unit_pro_id
    # .copy() so the column assignments below never write into a view of binddb.
    d = binddb.loc[is_target, ['Ligand SMILES', 'IC50 (nM)', 'Kd (nM)']].copy()
    d.columns = ['smiles', 'ic50', 'kd50']

    logging.debug(f'Number of obs: {d.shape[0]}:')
    logging.debug(f'{d.head()}')

    # A plain list works for an empty selection too, unlike DataFrame.apply(axis=1).
    d['metric_value'] = [
        _affinity_to_metric(ic50, kd) for ic50, kd in zip(d['ic50'], d['kd50'])
    ]
    # Drop unusable rows BEFORE de-duplicating, so a SMILES whose first entry has
    # no valid affinity can still be represented by a later valid entry.
    d = d[['smiles', 'metric_value']].dropna().drop_duplicates(subset='smiles')

    logging.debug(f'Number of obs: {d.shape[0]}:')

    if d.shape[0] < 10:
        logging.info('Less than 10 compound-target pairs. Not fitting a model')
        return 1

    # convert to fingerprint
    fps = []
    values = []
    for smiles, metric in d[['smiles', 'metric_value']].values:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            continue  # Skip SMILES that RDKit cannot parse
        try:
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2)
        except Exception:
            # Best effort: one bad molecule must not abort training.
            logging.debug(f'Could not fingerprint {smiles}; skipping')
            continue

        fps.append(fp)
        values.append(metric)

    if not fps:
        # Fitting on empty arrays would raise inside scikit-learn.
        logging.info('No valid molecules could be fingerprinted. Not fitting a model')
        return 1

    X = np.array(fps)
    y = np.array(values)

    regr = RandomForestRegressor(n_estimators=1000, random_state=0, n_jobs=-1)
    regr.fit(X, y)

    # Training-set R^2; computed once (scoring 1000 trees is not free).
    logging.debug(regr.score(X, y))

    if output_path is None:
        output_path = f'{target_unit_pro_id}_rfr_ligand_model.pt'

    with open(output_path, 'wb') as handle:
        pickle.dump(regr, handle)

    return 1

This version skips invalid molecules and handles infinite values.

Is that OK for the rest of the program? Thank you.