Closed jyaacoub closed 1 week ago
Number of pocket sequences found from KLIFS:
PDBbindDataset: 177/3785 (3608)
davis: 321/442 (121)
kiba: 221/228 (7)
```python
# %%
import pandas as pd
from src.data_prep.downloaders import Downloader
df = pd.read_csv('../data/all_prots.csv')
id_status = {}
for db in df.db.unique():
    id = Downloader.download_pocket_seq(df[df.db == db].prot_id.to_list(),
                                        f"../data/pocket_seq/{db}/",
                                        tqdm_desc=f"Downloading {db} pocket sequences")
    id_status[db] = id
#%%
import json
# json.dump(id_status, open('../data/pocket_seq/seq_out.json', 'w'))
# id_status = json.load(open('../data/pocket_seq/seq_out.json', 'r'))
for db, st in id_status.items():
    total_ids = len(st)
    missing = list(id_status[db].values()).count(400)
    print(f"{db}: {total_ids - missing}/{total_ids} ({missing})")
```
ABL1(E55K)
instead of just ABL1
)
import pandas as pd
from src.data_prep.downloaders import Downloader
# Load the protein table; the loop below reads its `db` and `prot_id` columns.
df = pd.read_csv('../data/all_prots.csv')

# Maps each database name to the result returned by
# Downloader.download_pocket_seq for that database's identifiers.
id_status = {}
for db in ['davis']:  # df.db.unique():
    prot_ids = df[df.db == db].prot_id.to_list()
    if db == 'davis':
        # davis identifiers carry extra annotations (e.g. "ABL1(F317I)p");
        # reduce each one to the bare gene name before querying.
        ids = []
        for gene in prot_ids:
            gene = gene.split('(')[0]  # get rid of mutation specifiers
            gene = gene.split('-')[0]  # get rid of phospho-specifiers
            # get rid of a trailing p only: splitting on 'p' would truncate
            # the name at the first 'p' wherever it occurs.
            if gene.endswith('p'):
                gene = gene[:-1]
            ids.append(gene)
    else:
        ids = prot_ids
    # Assign directly instead of going through a variable named `id`,
    # which would shadow the builtin.
    id_status[db] = Downloader.download_pocket_seq(
        ids,
        f"../data/pocket_seq/{db}/",
        tqdm_desc=f"Downloading {db} pocket sequences")
#%%
import json
# json.dump(id_status, open('../data/pocket_seq/seq_out.json', 'w'))
# id_status = json.load(open('../data/pocket_seq/seq_out.json', 'r'))
# Per database, print "<db>: <found>/<total> (<missing>)", where an entry
# counts as missing when its status value equals 400.
for db, st in id_status.items():
    n_total = len(st)
    n_missing = sum(1 for status in id_status[db].values() if status == 400)
    print(f"{db}: {n_total - n_missing}/{n_total} ({n_missing})")
Getting pockets for Kiba
/kinase_ID
API.
Getting pockets for davis:
Same as for kiba, but we use the raw Gene Name code (need to remove any mutation or phosphorylation information):
ABL1(F317I)p
->ABL1