Closed jyaacoub closed 6 months ago
Ring edge features are labeled as the following:
array(['HBOND:SC_MC', 'VDW:SC_SC', 'HBOND:MC_MC', 'VDW:MC_SC',
'HBOND:MC_SC', 'PIPISTACK:SC_SC', 'VDW:SC_MC', 'PICATION:SC_SC',
'HBOND:SC_SC', 'IONIC:SC_SC', 'VDW:MC_MC'])
Note that SC_SC means sidechain-sidechain and MC_MC means mainchain-mainchain, and likewise for the mixed pairs MC_SC and SC_MC.
Translating this to the main features we want to use:
- H-Bond -> HBOND
- Pi-Pi stack -> PIPISTACK
- Pi-Cation -> PICATION
- Ionic -> IONIC
- Van der Waals -> VDW
- Pi-H bond -> ????
- Contact distance -> From raw PDB file (see: 'simple' edge features)
To get the frequencies across multiple models/conformations, we can add the `--md` flag to the end of the ring command:
RING - Molecular Dynamics (MD) is a RING module that computes aggregated statistics over multi-state structure files like NMR structural ensembles or molecular dynamics snapshots (provided as different models in a PDB/mmCIF file).
The module can be executed by adding the `--md` flag:
ring -i 2m6z.cif --out_dir results --md
It generates the standard output files inside the results/ folder and creates an `md` subdirectory with four different types of data:
- `<fileName>_cm_<type>`: all the contact maps, one per model. The first column is the model number.
- `<fileName>_gcm_<type>`: the global contact map, number of contacts across models.
- `<fileName>_gfreq_<type>`: how many times each node is in contact over all models.
- `<fileName>_tdcm_<type>`: how many contacts for each node (rows) and each model (columns).

Where `<type>` is the type of interaction: HBOND, IAC, IONIC, PICATION, PIPISTACK, SSBOND, VDW
<fileName>_gfreq_<type>
Recall the features we want:
- H-Bond -> HBOND
- Pi-Pi stack -> PIPISTACK
- Pi-Cation -> PICATION
- Ionic -> IONIC
- Van der Waals -> VDW
- Contact distance -> From raw PDB file (see: 'simple' edge features)
#%%
from glob import glob
from pathlib import Path
from src.utils.residue import Ring3Runner
import os
import logging
logging.getLogger().setLevel(logging.INFO)
# %%
# Path to the 7LQT PDB file, used below as the single-structure input.
pdb_7lqt = f'{Path.home()}/projects/data/misc/7LQT.pdb'
# Directory that is searched for the EGFR structure files.
af_conf_dir = f'{Path.home()}/projects/data/misc/'
# Every file matching EGFR*/EGFR_unrelaxed_rank_*.pdb under af_conf_dir
# (presumably AlphaFold-style predicted conformations -- TODO confirm).
# glob returns matches in arbitrary order and an empty list if nothing matches.
af_confs_EGFR = glob(f'{af_conf_dir}/EGFR*/EGFR_unrelaxed_rank_*.pdb')
#%%
from src.utils.residue import Chain
import matplotlib.pyplot as plt
import numpy as np
# Cutoff applied to the raw contact maps to binarise them
# (presumably a distance in angstroms -- TODO confirm against Chain.get_contact_map).
thr = 8.0
n_cols = 3  # number of plots per row

# Run the same pipeline twice: once on the list of EGFR conformations and
# once on the single 7LQT structure. `opt` is bound by the loop itself, so
# no assignment is needed beforehand.
for opt in [af_confs_EGFR, pdb_7lqt]:
    # Identity check: the list object itself marks the multi-conformation case.
    is_ensemble = opt is af_confs_EGFR

    # get distance contact map
    if is_ensemble:
        # Fraction of conformations in which each pair falls below `thr`.
        # Stacking requires every conformation to yield a map of equal shape.
        chains = [Chain(p) for p in opt]
        M = np.array([c.get_contact_map() for c in chains]) < thr
        dist_cmap = M.mean(axis=0)
    else:
        dist_cmap = Chain(opt).get_contact_map() < thr

    # ring3 edge attribute extraction
    # Note: this will create a "combined" pdb file in the same directory as the conformations
    input_pdb, files = Ring3Runner.run(opt, overwrite=True)
    seq_len = len(Chain(input_pdb))

    # One contact map per ring3 output file (only the paths are needed here).
    r3_cmaps = [Ring3Runner.build_cmap(fp, seq_len) for fp in files.values()]
    # COMBINE convert to numpy array

    # Plot every ring3 cmap plus the distance cmap. The grid is sized from
    # the number of maps so an extra ring3 output cannot index past the axes;
    # with 6 maps this is the same 2x3, 15x10 figure as before.
    ks = list(files.keys()) + ['dist']
    cmaps = r3_cmaps + [dist_cmap]
    n_rows = -(-len(cmaps) // n_cols)  # ceiling division
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows),
                            squeeze=False)  # squeeze=False -> always a 2D array
    panels = axs.ravel()
    for ax, title, cmap in zip(panels, ks, cmaps):
        ax.matshow(cmap)
        ax.set_title(title)
    # Hide any grid cells left over when the count is not a multiple of n_cols.
    for ax in panels[len(cmaps):]:
        ax.axis('off')

    plt.suptitle(f'Ring3 Edge Attributes for {"EGFR" if is_ensemble else "7LQT"}')
    plt.show()
Sample test works well:
# %%
from pathlib import Path
from glob import glob
from src.utils.residue import Chain
from src.data_prep.feature_extraction.protein_edges import get_target_edge_weights
from src import config as cfg
import numpy as np
from torch_geometric.data import Data
from torch_geometric.nn import TransformerConv
import torch
# Sizes used throughout this smoke test.
NODE_FEAT_DIM = 2    # width of the random node features
EDGE_FEAT_DIM = 6    # width of the edge attributes given to TransformerConv
CONTACT_THR = 8.0    # cutoff applied to the contact map

af_conf_dir = f'{Path.home()}/projects/data/misc/'
conf_pattern = f'{af_conf_dir}/EGFR*/EGFR_unrelaxed_rank_*00.pdb'
af_confs_EGFR = glob(conf_pattern)

# The first matched file supplies the nodes of the sample graph.
target = Chain(af_confs_EGFR[0])
L = len(target)
x = torch.rand(L, NODE_FEAT_DIM, dtype=torch.float32)  # [N, feat_dim]

# %% get edge information
dist_cmap = target.get_contact_map() < CONTACT_THR
# np.tril keeps the lower triangle (diagonal included); the nonzero entries
# become the edge list.
lower_tri = np.tril(dist_cmap)
ei = torch.tensor(lower_tri).nonzero().T  # [2, E]

ring3_kwargs = dict(edge_opt=cfg.EDGE_OPT.ring3.value,
                    af_confs=af_confs_EGFR)
ea = get_target_edge_weights('', target.sequence, **ring3_kwargs)

# using only the first cmap to determine which edge values to use
rows, cols = ei[0], ei[1]
ea = torch.Tensor(ea[rows, cols, :])  # [E, 6]
sample_data = Data(x=x, edge_index=ei, edge_attr=ea)

#%%
model = TransformerConv(
    in_channels=NODE_FEAT_DIM,
    out_channels=NODE_FEAT_DIM,
    heads=1,
    edge_dim=EDGE_FEAT_DIM,  # 6 edge attributes
)
out = model(sample_data.x, sample_data.edge_index, sample_data.edge_attr)
out.shape
For AlphaFlow conformations, we can run ring3 as they are being generated:
#%%
from src.data_prep.datasets import BaseDataset
import pandas as pd
# Location of the XY.csv table for the "nomsa_ring3_original_binary" PDBbind dataset.
csv_p = "/cluster/home/t122995uhn/projects/data/PDBbindDataset/nomsa_ring3_original_binary/full/XY.csv"
# The first CSV column is used as the DataFrame index.
df = pd.read_csv(csv_p, index_col=0)
# Reduced table returned by BaseDataset.get_unique_prots (presumably one row per
# unique protein -- TODO confirm). It is later indexed by file code and read
# for its `prot_id` column.
df_unique = BaseDataset.get_unique_prots(df)
# %%
import os
from tqdm import tqdm
# Source directory of AlphaFlow outputs and the directory that receives one
# symlink per protein id.
alphaflow_dir = "/cluster/home/t122995uhn/projects/data/pdbbind/alphaflow_io/out_pdb_MD-distilled/"
ln_dir = "/cluster/home/t122995uhn/projects/data/pdbbind/alphaflow_io/out_pid_ln/"
os.makedirs(ln_dir, exist_ok=True)

# files are .pdb with 50 "models" in each
for file in tqdm(os.listdir(alphaflow_dir)):
    if not file.endswith('.pdb'):
        continue

    # The file stem is the key into df_unique; an unknown code raises KeyError.
    code, _ = os.path.splitext(file)
    pid = df_unique.loc[code].prot_id
    try:
        os.symlink(f"{alphaflow_dir}/{file}",
                   f"{ln_dir}/{pid}.pdb")
    except FileExistsError:
        # A link with this name is already present (e.g. from an earlier run
        # of this cell); keep it so the cell can be rerun, in line with
        # makedirs(exist_ok=True) above.
        continue
# %% RUN RING3
# %% Run RING3 on finished conformations from AlphaFlow
from src.utils.residue import Ring3Runner
# Gather the full path of every .pdb entry in the link directory and pass
# the whole list to RING3's multiprocess runner.
pdb_names = (name for name in os.listdir(ln_dir) if name.endswith('.pdb'))
files = [os.path.join(ln_dir, name) for name in pdb_names]

Ring3Runner.run_multiprocess(pdb_fps=files)
For this I will need to use a different GNN that accepts edge_attr, like TransformerConv.
belowCode