jyaacoub / MutDTA

Improving the precision oncology pipeline by providing binding affinity perturbation predictions on a priori identified cancer driver genes.
1 stars 2 forks source link

Missing AF2 Conformations for PDBbind #80

Closed jyaacoub closed 4 months ago

jyaacoub commented 4 months ago

In preparation for #77 we need conformations to input to RING in order to get frequency counts. However, for some PDBbind proteins we have missing conformations.

jyaacoub commented 4 months ago

Renamed previous conformations to use the pid so that we can identify which unique proteins are left; missing PDBs can be found on H4H in ~/projects/MutDTA/missing_pdbbind_confs.txt

code:

import pandas as pd
from glob import glob
from tqdm import tqdm
import logging
import os

# Full PDBbind dataset table, indexed by PDB code, with a 'prot_id' column
# holding the unique protein identifier. NOTE(review): hard-coded cluster path.
df = pd.read_csv('/cluster/home/t122995uhn/projects/data/PDBbindDataset/nomsa_binary_original_binary/full/XY.csv', index_col=0)
# Renaming conformation files to use pid instead of PDB code.

def get_pid(df, code):
    """Map a PDB *code* to its unique protein id via *df*.

    Falls back to returning *code* itself when *df* is None or the code
    is not present in the index.
    """
    if df is None or code not in df.index:
        return code
    return df.loc[code]['prot_id']

def af_conf_files(dir, pid):
    """Return every AlphaFold conformation PDB for *pid* located in *dir*."""
    pattern = f'{dir}/{pid}_model_*.pdb'
    return glob(pattern)

dir_p = '/cluster/home/t122995uhn/projects/data/PDBbind_afConf/'

# One row per unique protein; the index is the PDB code, 'prot_id' the pid.
df_unique = df[['prot_id']].drop_duplicates()

# Rename every conformation file from <code>_model_*.pdb to <pid>_model_*.pdb,
# collecting codes that have no conformation files at all.
missing = []
for idx, (pid,) in df_unique[['prot_id']].iterrows():
    af_confs_idx = af_conf_files(dir_p, idx)

    if len(af_confs_idx) == 0:
        logging.debug(f"no confs for {idx}")
        missing.append(idx)
        continue

    for f in af_confs_idx:
        # Replace the code only within the basename: a plain str.replace on
        # the full path could also rewrite a matching substring inside the
        # directory component, producing a bogus destination path.
        head, base = os.path.split(f)
        new_f = os.path.join(head, base.replace(idx, pid, 1))
        os.rename(f, new_f)
jyaacoub commented 4 months ago

Below is code to get the number of mismatched, missing, and safe codes:

from src.data_prep.datasets import PDBbindProcessor, BaseDataset
import pandas as pd
# Cluster paths for PDBbind structures and the AF2 conformation directory.
DATA_ROOT = '/cluster/home/t122995uhn/projects/data/v2020-other-PL/' 
AF_CONF_DIR = '/cluster/home/t122995uhn/projects/data/PDBbind_afConf/'
csv_fp = '/cluster/home/t122995uhn/projects/data/PDBbindDataset/nomsa_ring3_original_binary/full/XY.csv'
# Dataset table indexed by PDB code; reduced to one row per unique protein.
df = pd.read_csv(csv_fp, index_col=0)
df_unique = BaseDataset.get_unique_prots(df)

# %%
import os
from glob import glob
def pdb_p(code):
    """Path to the experimental PDBbind structure file for entry *code*."""
    fname = f'{code}_protein.pdb'
    return os.path.join(DATA_ROOT, code, fname)

def af_conf_files(pid) -> list[str]:
    """All AlphaFold conformation PDBs for protein id *pid* under AF_CONF_DIR."""
    pattern = f'{AF_CONF_DIR}/{pid}_model_*.pdb'
    return glob(pattern)

from src.utils.residue import Chain
from tqdm import tqdm
import logging
# DEBUG messages below are suppressed at INFO level; lower to DEBUG to see them.
logging.getLogger().setLevel(logging.INFO)
# Classify every unique protein:
#   missing    -> no AF2 conformation files found for its pid
#   mismatched -> conformations exist but their sequence differs from the csv's
#   safe       -> conformations found and sequence matches
mismatched = {}
missing = {}
safe = {}
for code, (pid, seq) in tqdm(df_unique[['prot_id', 'prot_seq']].iterrows(),
                             total=len(df_unique)):
    afs = af_conf_files(pid)

    if len(afs) == 0:
        logging.debug(f'Missing confs for {pid}')
        missing[code] = pid
        continue

    # Only the first conformation's sequence is checked; the AF2 models are
    # presumably all the same sequence — TODO confirm.
    af_seq = Chain(afs[0]).sequence
    if seq != af_seq:
        logging.debug(f'Mismatched sequence for {pid}')
        mismatched[code] = (pid, seq, af_seq)
        continue

    safe[code] = pid

# Sanity check: for "safe" codes, the experimental PDB's sequence must also
# match the first AF2 conformation's sequence.
for code, pid in safe.items():
    s0 = Chain(pdb_p(code)).sequence
    s1 = Chain(af_conf_files(pid)[0]).sequence

    assert s0 == s1, f"mismatch for af conf and pdb on ({code}, {pid})"

# Saving the three classifications to csv.
# NOTE(review): manual CSV writing — assumes pids/sequences contain no commas
# or newlines, which holds for PDB codes and plain amino-acid sequences.
with open('mismatched_codes.csv', 'w') as f:
    f.write('code,pid,seq,af_seq\n')
    for code, (pid, seq, af_seq) in mismatched.items():
        f.write(f'{code},{pid},{seq},{af_seq}\n')

with open('missing_codes.csv', 'w') as f:
    f.write('code,pid\n')
    for code, pid in missing.items():
        f.write(f'{code},{pid}\n')

with open('safe_codes.csv', 'w') as f:
    f.write('code,pid\n')
    for code, pid in safe.items():
        f.write(f'{code},{pid}\n')

Mismatched prots: 860

Missing prots: 2743

Safe prots: 191

import pandas as pd
# Reload the three classification tables written above (indexed by code).
df_mm = pd.read_csv('mismatched_codes.csv', index_col=0)  # mismatched sequences
df_m = pd.read_csv('missing_codes.csv', index_col=0)      # no AF2 conformations
df_s = pd.read_csv('safe_codes.csv', index_col=0)         # ready to use

Any missing input files? (a3m alignment file)

For missing prots we are only missing 1 MSA file

import os
from tqdm import tqdm
# Path to the precomputed MSA (a3m) file for a given PDB code.
msa = lambda c: f'/cluster/projects/kumargroup/msa/output/{c}.msa.a3m'

# Collect codes whose MSA input file does not exist. The surrounding notes
# report this for the *missing* codes (result: ['4jmx']) and state separately
# that no mismatched codes lack MSAs, but the original loop scanned df_mm
# (mismatched) — iterate the missing codes (df_m) instead.
no_msa = []
for code in df_m.index:
    if not os.path.exists(msa(code)):
        no_msa.append(code)

from above: no_msa = ['4jmx']

For mismatched prots we are not missing any MSAs

For safe prots obviously we are not missing any MSAs...

jyaacoub commented 4 months ago

Moving and splitting msas into 10 parts

copy unique prots

# msa files are index by pdb code
import os, shutil
from tqdm import tqdm

df_mm = pd.read_csv('mismatched_codes_v2.csv', index_col=0)
df_m = pd.read_csv('missing_codes_v2.csv', index_col=0)

msa = lambda c: f'/cluster/projects/kumargroup/msa/output/{c}.msa.a3m'
codes = list(df_mm.index) + list(df_m.index)
out_dir = '/cluster/home/t122995uhn/projects/data/pdbbind/a3m_unique_prot/all/'

for code in tqdm(codes):
    fp = msa(code)
    out_p = f"{out_dir}/{code}.a3m"
    if code == "4jmx": continue
    shutil.copyfile(fp, out_p)

split into 10 parts

#!/bin/bash

# Split the .a3m files in src_dir into num_splits sibling directories
# (part1..partN) by symlinking them in roughly equal batches.

# Number of splits provided as argument
num_splits=10

# Define the source directory containing your .a3m input files
src_dir="/cluster/home/t122995uhn/projects/data/pdbbind/a3m_unique_prot/all/"
ext=".a3m"
 # Get the parent directory of the source directory
parent_dir=$(dirname "$src_dir")

# Create an array to hold destination directories
declare -a dest_dirs=()

# Create destination directories based on the number of splits
for ((i=1; i<=$num_splits; i++)); do
    dest_dirs+=("$parent_dir/part$i/")
done

# Make sure the destination directories exist
for dir in "${dest_dirs[@]}"; do
    mkdir -p "$dir"
done

# Collect the files into an array instead of parsing `ls` output — safe for
# unusual filenames and gives us the count for free.
files=("$src_dir"/*"$ext")
total_files=${#files[@]}

echo "Total files:" $total_files

# Calculate the number of files in each part (integer division rounds down).
files_per_part=$((total_files / num_splits))
(( files_per_part == 0 )) && files_per_part=1

# Use a counter to keep track of the files processed
file_counter=0

# Loop through the source directory and link files into destination dirs.
for file in "${files[@]}"; do
    part=$((file_counter / files_per_part + 1))
    # Because of the rounding above, the remainder files would index past
    # dest_dirs, making dest_dir expand empty and ln target "/<name>.a3m"
    # (the "Permission denied" failures seen previously). Clamp to the
    # last part instead.
    (( part > num_splits )) && part=$num_splits
    dest_dir="${dest_dirs[$part - 1]}"
    ln -s "$file" "$dest_dir/$(basename "$file" "$ext").a3m"
    file_counter=$((file_counter + 1))
done

echo "Splitting complete: " $file_counter

# failed:
# ln: failed to create symbolic link ‘/6v1c.a3m’: Permission denied
# ln: failed to create symbolic link ‘/8gpb.a3m’: Permission denied
# ln: failed to create symbolic link ‘/9icd.a3m’: Permission denied
# ln -s /cluster/home/$me/projects/data/pdbbind/a3m_unique_prot/all/6v1c.a3m /cluster/home/$me/projects/data/pdbbind/a3m_unique_prot/part10
jyaacoub commented 4 months ago

Rerunning af2_confs and combine

Combined old "safe" confs with new updated confs:


# %% Checking XY.csv and regenerate mismatches
from src.data_prep.processors import PDBbindProcessor
from src.data_prep.datasets import BaseDataset
from src.data_prep.feature_extraction.protein import multi_get_sequences
from tqdm import tqdm
import pandas as pd
import os
from glob import glob

# Cluster paths — note these now live under a pdbbind/ subdirectory, unlike
# the paths used in the earlier scripts in this thread.
DATA_ROOT   = '/cluster/home/t122995uhn/projects/data/pdbbind/v2020-other-PL/' 
AF_CONF_DIR = '/cluster/home/t122995uhn/projects/data/pdbbind/PDBbind_afConf/'
csv_fp      = '/cluster/home/t122995uhn/projects/data/PDBbindDataset/nomsa_ring3_original_binary/full/XY.csv'

def pdb_p(code):
    """Path to the experimental PDBbind structure file for entry *code*."""
    protein_file = f'{code}_protein.pdb'
    return os.path.join(DATA_ROOT, code, protein_file)
def af_conf_files(pid) -> list[str]:
    """All AF2 conformation PDBs for protein id *pid* under AF_CONF_DIR."""
    search = f'{AF_CONF_DIR}/{pid}_model_*.pdb'
    return glob(search)
def cmap_p(code):
    """Path to the precomputed contact map (.npy) for entry *code*."""
    cmap_file = f'{code}.npy'
    return os.path.join(DATA_ROOT, 'cmaps', cmap_file)

df = pd.read_csv(csv_fp, index_col=0)
df_unique = BaseDataset.get_unique_prots(df)

# %% create symbolic link of missing files into new folder
REMAIN_DIR = '/cluster/home/t122995uhn/projects/data/pdbbind/a3m_unique_prot/remainder/'
AF2_OUT_DIR = '/cluster/home/t122995uhn/projects/data/pdbbind/pdbbind_af2_out/'
ALL_LN_DIR = '/cluster/home/t122995uhn/projects/data/pdbbind/pdbbind_af2_out/all_ln/'

# Link every found conformation into ALL_LN_DIR as <pid>_model_<i>.pdb and
# collect proteins with no conformations for the second pass below.
missing = {}
for code, (pid, seq) in tqdm(df_unique[['prot_id', 'prot_seq']].iterrows(),
                             total=len(df_unique)):
    # Check main dir for code
    confs = glob(f'{AF2_OUT_DIR}/out*/{code}*_model_*.pdb')

    if len(confs) == 0:
        # fallback to old_outs if not found (searched by pid here; the second
        # pass below retries old_outs by code)
        confs = glob(f'{AF2_OUT_DIR}/old_outs/out*/{pid}*_model_*.pdb')

    if len(confs) == 0:
        missing[code] = pid
        continue

    # create a symlink in the all_ln dir
    # NOTE(review): os.symlink raises FileExistsError if the link exists —
    # rerunning this cell requires clearing ALL_LN_DIR first.
    for i, c in enumerate(confs):
        os.symlink(c, os.path.join(ALL_LN_DIR, f'{pid}_model_{i}.pdb'))
    # BUGFIX: the original had a stray `missing = {}` here inside the loop
    # body, which wiped the accumulated missing codes on every iteration that
    # created symlinks — leaving the second pass with an almost-empty dict.

# %%
# Second pass: retry the still-missing codes against old_outs keyed by *code*
# (the fallback in the previous loop searched old_outs by pid).
still_missing = {}
for code, pid in tqdm(missing.items()):
    confs = glob(f'{AF2_OUT_DIR}/old_outs/out*/{code}*_model_*.pdb')

    if len(confs) == 0:
        still_missing[code] = pid
        continue

    # create a symlink in the all_ln dir
    # NOTE(review): os.symlink raises FileExistsError if the link exists.
    for i, c in enumerate(confs):
        os.symlink(c, os.path.join(ALL_LN_DIR, f'{pid}_model_{i}.pdb'))
jyaacoub commented 4 months ago

Issue is now resolved, this was a big pain in the ass to debug and I need to figure out a way better approach to issues like this.

This is where unit testing would be useful, but I am simply just a man and can't do this all in a reasonable time frame.