Open yanbosmu opened 3 months ago
I rewrote train_ligand_binding_model.py as follows:
import logging
import pickle
from pathlib import Path
from typing import Optional

import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
def _read_binding_db(binding_db_path):
    """Load the tab-separated binding database, dropping malformed rows."""
    read_kwargs = dict(sep="\t", header=0, low_memory=False)
    try:
        # ``error_bad_lines`` was deprecated in pandas 1.3 and removed in 2.0;
        # ``on_bad_lines="warn"`` is its replacement (drop the row, emit a
        # warning), matching the old ``error_bad_lines=False`` default.
        return pd.read_csv(binding_db_path, on_bad_lines="warn", **read_kwargs)
    except TypeError:
        # pandas older than 1.3 does not know ``on_bad_lines``.
        return pd.read_csv(binding_db_path, error_bad_lines=False, **read_kwargs)


def _parse_affinity_nm(raw):
    """Convert one affinity cell to a float, or NaN when it cannot be parsed.

    Cells may be numbers, missing values, or strings carrying a leading
    qualifier such as ``">10000"``; the qualifier is stripped before parsing.
    """
    if raw is None:
        return np.nan
    try:
        return float(raw)
    except (TypeError, ValueError):
        pass
    try:
        return float(str(raw).strip().lstrip("<>=~ "))
    except ValueError:
        # Unparseable text must not abort the whole run; treat it as missing.
        return np.nan


def _affinity_to_metric(ic50, kd):
    """Return ``-log10`` of the smaller of two nM affinities, in molar units.

    NaN is returned when neither value is usable, when the smaller value is
    non-positive or infinite, or when the result itself is not finite.
    """
    present = [
        v for v in (_parse_affinity_nm(ic50), _parse_affinity_nm(kd))
        if not np.isnan(v)
    ]
    if not present:
        return np.nan
    t = min(present)
    if t <= 0 or np.isinf(t):
        return np.nan
    # ``t * 1E-9`` can underflow to 0 for denormal inputs, which would yield
    # +inf; silence the warning and reject any non-finite result explicitly.
    with np.errstate(divide="ignore"):
        metric = -np.log10(t * 1E-9)
    return float(metric) if np.isfinite(metric) else np.nan


def train_ligand_binding_model(
    target_unit_pro_id: str,
    binding_db_path: str,
    output_path: Optional[str] = None,
) -> int:
    """Fit a random-forest affinity model for one target and pickle it.

    Rows of the binding database whose
    ``'UniProt (SwissProt) Primary ID of Target Chain'`` equals
    ``target_unit_pro_id`` are selected. For each ligand the smaller of
    ``'IC50 (nM)'`` and ``'Kd (nM)'`` is converted to ``-log10(value * 1e-9)``
    and used as the regression target; Morgan fingerprints (radius 2) of the
    ligand SMILES are the features.

    Args:
        target_unit_pro_id: UniProt ID used to filter the database.
        binding_db_path: Path to the tab-separated binding database.
        output_path: Where to pickle the fitted model. When ``None`` the file
            ``'<target_unit_pro_id>_rfr_ligand_model.pt'`` is written.

    Returns:
        Always ``1``, both when a model was written and when fitting was
        skipped because fewer than 10 usable ligands were available.
    """
    min_pairs = 10

    binddb = _read_binding_db(binding_db_path)
    is_target = (
        binddb['UniProt (SwissProt) Primary ID of Target Chain']
        == target_unit_pro_id
    )
    # ``.copy()`` so the column writes below never touch a view of ``binddb``.
    d = binddb.loc[is_target, ['Ligand SMILES', 'IC50 (nM)', 'Kd (nM)']].copy()
    d.columns = ['smiles', 'ic50', 'kd50']
    logging.debug(f'Number of obs: {d.shape[0]}:')
    logging.debug(f'{d.head()}')

    # A plain list works for an empty selection, where ``DataFrame.apply``
    # with ``axis=1`` would hand back a DataFrame instead of a Series.
    d['metric_value'] = [
        _affinity_to_metric(ic50, kd) for ic50, kd in zip(d['ic50'], d['kd50'])
    ]
    # Drop invalid rows BEFORE de-duplicating so that a SMILES whose first
    # record is unusable is still kept through a later, valid record.
    d = d[['smiles', 'metric_value']].dropna().drop_duplicates(subset='smiles')
    logging.debug(f'Number of obs: {d.shape[0]}:')
    if d.shape[0] < min_pairs:
        logging.info('Less than 10 compound-target pairs. Not fitting a model')
        return 1

    # Convert each SMILES to a fingerprint, skipping molecules RDKit rejects.
    fps = []
    values = []
    for smiles, metric in zip(d['smiles'], d['metric_value']):
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            continue  # Skip invalid molecules
        try:
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2)
        except Exception:
            logging.debug('Could not fingerprint %r; skipping', smiles)
            continue
        fps.append(fp)
        values.append(metric)

    # Skipped molecules can push the usable count below the threshold (or to
    # zero, which would make ``fit`` raise).
    if len(fps) < min_pairs:
        logging.info(
            'Less than 10 valid molecules after SMILES parsing. '
            'Not fitting a model'
        )
        return 1

    X = np.array(fps)
    y = np.array(values)
    regr = RandomForestRegressor(n_estimators=1000, random_state=0, n_jobs=-1)
    regr.fit(X, y)
    # Scoring 1000 trees is not free; only do it when the result is logged.
    if logging.getLogger().isEnabledFor(logging.DEBUG):
        logging.debug(regr.score(X, y))

    if output_path is None:
        output_path = f'{target_unit_pro_id}_rfr_ligand_model.pt'
    with open(output_path, 'wb') as handle:
        pickle.dump(regr, handle)
    return 1
This version skips molecules whose SMILES cannot be parsed and discards affinity values that are zero, negative, infinite, or NaN.
Is that OK for the rest of the program? Thank you.
Traceback (most recent call last):
  File "/home/yanbosmu/桌面/your_path/mypolygon/bin/polygon", line 8, in
    sys.exit(main())
  File "/home/yanbosmu/桌面/your_path/mypolygon/lib/python3.9/site-packages/polygon/run.py", line 849, in main
    r = train_ligand_binding_model_main(args)
  File "/home/yanbosmu/桌面/your_path/mypolygon/lib/python3.9/site-packages/polygon/run.py", line 810, in train_ligand_binding_model_main
    train_ligand_binding_model( args.uniprot_id,
  File "/home/yanbosmu/桌面/your_path/mypolygon/lib/python3.9/site-packages/polygon/utils/train_ligand_binding_model.py", line 69, in train_ligand_binding_model
    regr.fit(X,y)
  File "/home/yanbosmu/桌面/your_path/mypolygon/lib/python3.9/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/yanbosmu/桌面/your_path/mypolygon/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 363, in fit
    X, y = self._validate_data(
  File "/home/yanbosmu/桌面/your_path/mypolygon/lib/python3.9/site-packages/sklearn/base.py", line 650, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/home/yanbosmu/桌面/your_path/mypolygon/lib/python3.9/site-packages/sklearn/utils/validation.py", line 1318, in check_X_y
    y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator)
  File "/home/yanbosmu/桌面/your_path/mypolygon/lib/python3.9/site-packages/sklearn/utils/validation.py", line 1328, in _check_y
    y = check_array(
  File "/home/yanbosmu/桌面/your_path/mypolygon/lib/python3.9/site-packages/sklearn/utils/validation.py", line 1064, in check_array
    _assert_all_finite(
  File "/home/yanbosmu/桌面/your_path/mypolygon/lib/python3.9/site-packages/sklearn/utils/validation.py", line 123, in _assert_all_finite
    _assert_all_finite_element_wise(
  File "/home/yanbosmu/桌面/your_path/mypolygon/lib/python3.9/site-packages/sklearn/utils/validation.py", line 172, in _assert_all_finite_element_wise
    raise ValueError(msg_err)
ValueError: Input y contains infinity or a value too large for dtype('float64')
I successfully ran train_ligand_binding_model for some targets (UniProt IDs), such as Q9Y572 and Q13546, but I got the error above for both examples in your tutorial, Q02750 and P42345.
Any clues about this? Thank you!