Closed jyaacoub closed 8 months ago
They use the entire PDB file for the Amber MD simulation.
Creation of the graph data objects is done with the GNNTransformMD
class which uses prot_graph_transform to do so:
This results in a Data object that looks like this:
DataBatch(x=[3262, 11], edge_index=[2, 52466], edge_attr=[52466], y=[3262], pos=[3262, 3], ids=[1], batch=[3262], ptr=[2])
Where x holds our nodes: 3262 is the number of atoms and 11 is the length of the one-hot vector for the different atom types. Edge index is the same, and edge_attr is a simple reciprocal Euclidean distance metric that acts as an edge weight representing the distance between nodes: the further apart they are, the smaller the weight.
import pickle
import pandas as pd
from misato_dataset import MolDataset, ProtDataset, GNNTransformMD, GNNTransformQM
from torch_geometric.loader import DataLoader
from torch_geometric import transforms as T
from src.data_prep.feature_extraction.protein import Chain
from pathlib import Path
import pickle
# Root locations of the full MISATO download and of the "tiny" sample data
# that ships inside the misato-dataset repository checkout.
HOME = Path.home()
misato_dir = f'{HOME}/projects/data/MISATO/'
misato_dir_tiny = f"{HOME}/projects/misato-dataset/data"
map_dir = f"{HOME}/projects/misato-dataset/src/misato_dataset/processing/Maps/"


def _load_pickle(path):
    """Return the object stored in the pickle file at *path*.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the object has been read, instead of being left open until garbage
    collection.

    NOTE(review): ``pickle.load`` can execute arbitrary code; only use this on
    map files that come from a trusted source.
    """
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# Lookup tables for the integer codes stored in the MD data; they are indexed
# by those codes further down (i_to_type / i_to_res).
atm_name = _load_pickle(f"{map_dir}/atoms_name_map_for_pdb.pickle")
i_to_type = _load_pickle(f"{map_dir}/atoms_type_map.pickle")
i_to_res = _load_pickle(f"{map_dir}/atoms_residue_map.pickle")

# Tiny MD dataset and its train/val/test split index files.
mdh5_file = f'{misato_dir_tiny}/MD/h5_files/tiny_md_out.hdf5'
train_idx = f"{misato_dir_tiny}/MD/splits/train_tinyMD.txt"
val_idx = f"{misato_dir_tiny}/MD/splits/val_tinyMD.txt"
test_idx = f"{misato_dir_tiny}/MD/splits/test_tinyMD.txt"

# ID of the structure to inspect. The previous '2G6P' assignment was dead code
# (immediately overwritten), so only the effective value is kept.
p_id = '10gs'.lower()
# %% LOAD PROTEIN DATA:
# Paths into the full MISATO download: the MD HDF5 file and its training split.
MD_fp = f"{misato_dir}/MD.hdf5"
train = f'{misato_dir}/train_MD.txt'

# Same dataset configuration twice: once on the tiny sample file and once on
# the full MD file. Each dataset gets its own transform instances.
pro_test_tiny = ProtDataset(
    mdh5_file,
    idx_file=train_idx,
    transform=GNNTransformMD(),
    post_transform=T.RandomJitter(0.05),
)
pro_test = ProtDataset(
    MD_fp,
    idx_file=train,
    transform=GNNTransformMD(),
    post_transform=T.RandomJitter(0.05),
)

# Raw (untransformed) entry for the chosen structure, keyed by upper-case ID.
# NOTE(review): `.f` is presumably the dataset's open HDF5 handle - confirm.
sample = pro_test.f[p_id.upper()]
#%%
test_loader = DataLoader(pro_test_tiny, batch_size=1, num_workers=16)
# Pull just the first batch out of the loader and stop iterating.
for sample_loader in test_loader:
    break
#%%
# Translate the per-atom integer codes of the sample into string labels using
# the lookup tables loaded from the map pickles.
seq = [i_to_res[res_code] for res_code in sample['atoms_residue']]
atom_type = [i_to_type[type_code] for type_code in sample['atoms_type']]

# NOTE(review): 'CX' is presumably the atom-type label of the alpha carbon
# (CA) in this map - confirm against the atom-type table.
ca_count = atom_type.count('CX')
print("CA count:", ca_count)
#%% GET REAL
# Load the full-protein PDB file for the same ID and print its length and
# representation, for comparison against the MISATO entry.
protein_pdb = f"{HOME}/projects/data/v2020-other-PL/{p_id}/{p_id}_protein.pdb"
c = Chain(protein_pdb)
print(len(c), c)

# Repeat for the pocket-only PDB file; `c` ends up bound to the pocket chain.
pocket_pdb = f"{HOME}/projects/data/v2020-other-PL/{p_id}/{p_id}_pocket.pdb"
c = Chain(pocket_pdb)
print(len(c), c)
#%%
from prody import parsePDB

# Switch to a different structure for this cell.
p_id = '2G6P'.lower()

# Parse the protein PDB file with ProDy, restricted to the 'ca' subset.
protein_pdb_fp = f"{HOME}/projects/data/v2020-other-PL/{p_id}/{p_id}_protein.pdb"
pdb = parsePDB(protein_pdb_fp, subset='ca')
# Bare expression: displays the parsed object when run as a notebook cell.
pdb
#%%
There are some issues with the MISATO dataset that should be resolved before I can readily use it (see https://github.com/t7morgen/misato-dataset/issues/7).
CODE