Renamed the previous conformation files (confs) to use the pid so that we can identify which unique proteins are left; the missing PDBs can be found on H4H in ~/projects/MutDTA/missing_pdbbind_confs.txt.
import pandas as pd
from glob import glob
from tqdm import tqdm
import logging
import os

df = pd.read_csv('/cluster/home/t122995uhn/projects/data/PDBbindDataset/nomsa_binary_original_binary/full/XY.csv', index_col=0)

# renaming conformations to use pid instead of PDB code
def get_pid(df, code):
    if df is not None and code in df.index:
        return df.loc[code]['prot_id']
    return code

def af_conf_files(dir, pid):
    return glob(f'{dir}/{pid}_model_*.pdb')

dir_p = '/cluster/home/t122995uhn/projects/data/PDBbind_afConf/'

df_unique = df[['prot_id']].drop_duplicates()
missing = []
for idx, (pid,) in df_unique[['prot_id']].iterrows():
    af_confs_idx = af_conf_files(dir_p, idx)
    if len(af_confs_idx) == 0:
        logging.debug(f"no confs for {idx}")
        missing.append(idx)
        continue
    for f in af_confs_idx:
        new_f = f.replace(idx, pid)
        os.rename(f, new_f)
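The codes with no confs can then be written out to the text file mentioned at the top (a small sketch; the path is the H4H one referenced above):
# dump the codes that still have no conformations (see missing_pdbbind_confs.txt above)
with open(os.path.expanduser('~/projects/MutDTA/missing_pdbbind_confs.txt'), 'w') as f:
    f.write('\n'.join(missing))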
Below is code to get the number of mismatched, missing, and safe codes:
from src.data_prep.datasets import PDBbindProcessor, BaseDataset
import pandas as pd

DATA_ROOT = '/cluster/home/t122995uhn/projects/data/v2020-other-PL/'
AF_CONF_DIR = '/cluster/home/t122995uhn/projects/data/PDBbind_afConf/'
csv_fp = '/cluster/home/t122995uhn/projects/data/PDBbindDataset/nomsa_ring3_original_binary/full/XY.csv'

df = pd.read_csv(csv_fp, index_col=0)
df_unique = BaseDataset.get_unique_prots(df)

# %%
import os
from glob import glob

def pdb_p(code):
    return os.path.join(DATA_ROOT, code, f'{code}_protein.pdb')

def af_conf_files(pid) -> list[str]:
    return glob(f'{AF_CONF_DIR}/{pid}_model_*.pdb')

from src.utils.residue import Chain
from tqdm import tqdm
import logging

logging.getLogger().setLevel(logging.INFO)

mismatched = {}
missing = {}
safe = {}
for code, (pid, seq) in tqdm(df_unique[['prot_id', 'prot_seq']].iterrows(),
                             total=len(df_unique)):
    afs = af_conf_files(pid)
    if len(afs) == 0:
        logging.debug(f'Missing confs for {pid}')
        missing[code] = pid
        continue

    af_seq = Chain(afs[0]).sequence
    if seq != af_seq:
        logging.debug(f'Mismatched sequence for {pid}')
        mismatched[code] = (pid, seq, af_seq)
        continue

    safe[code] = pid
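# the point of this script is the counts, so tally the three groups
print(f'mismatched: {len(mismatched)}, missing: {len(missing)}, safe: {len(safe)}')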
# sanity check: for safe codes, the af conf sequence must also match the pdb sequence
for code, pid in safe.items():
    s0 = Chain(pdb_p(code)).sequence
    s1 = Chain(af_conf_files(pid)[0]).sequence
    assert s0 == s1, f"mismatch for af conf and pdb on ({code}, {pid})"

# saving to output
with open('mismatched_codes.csv', 'w') as f:
    f.write('code,pid,seq,af_seq\n')
    for code, (pid, seq, af_seq) in mismatched.items():
        f.write(f'{code},{pid},{seq},{af_seq}\n')

with open('missing_codes.csv', 'w') as f:
    f.write('code,pid\n')
    for code, pid in missing.items():
        f.write(f'{code},{pid}\n')

with open('safe_codes.csv', 'w') as f:
    f.write('code,pid\n')
    for code, pid in safe.items():
        f.write(f'{code},{pid}\n')
import pandas as pd

df_mm = pd.read_csv('mismatched_codes.csv', index_col=0)
df_m = pd.read_csv('missing_codes.csv', index_col=0)
df_s = pd.read_csv('safe_codes.csv', index_col=0)

import os
from tqdm import tqdm

msa = lambda c: f'/cluster/projects/kumargroup/msa/output/{c}.msa.a3m'

no_msa = []
for code in df_mm.index:
    if not os.path.exists(msa(code)):
        no_msa.append(code)
from above: no_msa = ['4jmx']
# msa files are indexed by pdb code
import os, shutil
from tqdm import tqdm

df_mm = pd.read_csv('mismatched_codes_v2.csv', index_col=0)
df_m = pd.read_csv('missing_codes_v2.csv', index_col=0)

msa = lambda c: f'/cluster/projects/kumargroup/msa/output/{c}.msa.a3m'
codes = list(df_mm.index) + list(df_m.index)

out_dir = '/cluster/home/t122995uhn/projects/data/pdbbind/a3m_unique_prot/all/'
for code in tqdm(codes):
    if code == "4jmx":  # no MSA exists for this code (see above)
        continue
    fp = msa(code)
    out_p = f"{out_dir}/{code}.a3m"
    shutil.copyfile(fp, out_p)
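As a quick sanity check, the copies can be verified against the code list (a sketch assuming the loop above completed; 4jmx is excluded since it has no MSA):
# verify every code except 4jmx now has a copied .a3m file in out_dir
copied = {os.path.splitext(f)[0] for f in os.listdir(out_dir)}
assert set(codes) - {'4jmx'} <= copied, 'some MSA files were not copied'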
#!/bin/bash
# Number of splits, provided as the first argument (defaults to 10)
num_splits=${1:-10}

# Define the source directory containing the .a3m input files
src_dir="/cluster/home/t122995uhn/projects/data/pdbbind/a3m_unique_prot/all/"
ext=".a3m"

# Get the parent directory of the source directory
parent_dir=$(dirname "$src_dir")

# Create destination directories based on the number of splits
declare -a dest_dirs=()
for ((i=1; i<=num_splits; i++)); do
    dest_dirs+=("$parent_dir/part$i/")
done

# Make sure the destination directories exist
for dir in "${dest_dirs[@]}"; do
    mkdir -p "$dir"
done

# Calculate the total number of files in the source directory
total_files=$(ls "$src_dir"/*"$ext" | wc -l)
echo "Total files: $total_files"

# Calculate the number of files in each part
files_per_part=$((total_files / num_splits))

# Loop through the source directory and link files into the destination directories
file_counter=0
for file in "$src_dir"/*"$ext"; do
    part=$((file_counter / files_per_part + 1))
    # clamp so the remainder from the integer division lands in the last part
    ((part > num_splits)) && part=$num_splits
    dest_dir="${dest_dirs[$part - 1]}"
    ln -s "$file" "$dest_dir/$(basename "$file" "$ext")$ext"
    file_counter=$((file_counter + 1))
done
echo "Splitting complete: $file_counter"
# Note: without the clamp above, the remainder files get part=num_splits+1,
# so "$dest_dir" expands to an empty string and ln targets the filesystem
# root; that is exactly what failed on the first run:
# ln: failed to create symbolic link ‘/6v1c.a3m’: Permission denied
# ln: failed to create symbolic link ‘/8gpb.a3m’: Permission denied
# ln: failed to create symbolic link ‘/9icd.a3m’: Permission denied
# Those three were linked into the last part manually, e.g.:
# ln -s /cluster/home/$me/projects/data/pdbbind/a3m_unique_prot/all/6v1c.a3m /cluster/home/$me/projects/data/pdbbind/a3m_unique_prot/part10
Combined old "safe" confs with new updated confs:
# %% Checking XY.csv and regenerating mismatches
from src.data_prep.processors import PDBbindProcessor
from src.data_prep.datasets import BaseDataset
from src.data_prep.feature_extraction.protein import multi_get_sequences
from tqdm import tqdm
import pandas as pd
import os
from glob import glob

DATA_ROOT = '/cluster/home/t122995uhn/projects/data/pdbbind/v2020-other-PL/'
AF_CONF_DIR = '/cluster/home/t122995uhn/projects/data/pdbbind/PDBbind_afConf/'
csv_fp = '/cluster/home/t122995uhn/projects/data/PDBbindDataset/nomsa_ring3_original_binary/full/XY.csv'

def pdb_p(code):
    return os.path.join(DATA_ROOT, code, f'{code}_protein.pdb')

def af_conf_files(pid) -> list[str]:
    return glob(f'{AF_CONF_DIR}/{pid}_model_*.pdb')

def cmap_p(code):
    return os.path.join(DATA_ROOT, 'cmaps', f'{code}.npy')

df = pd.read_csv(csv_fp, index_col=0)
df_unique = BaseDataset.get_unique_prots(df)

# %% create symbolic links of the found confs in a new folder
REMAIN_DIR = '/cluster/home/t122995uhn/projects/data/pdbbind/a3m_unique_prot/remainder/'
AF2_OUT_DIR = '/cluster/home/t122995uhn/projects/data/pdbbind/pdbbind_af2_out/'
ALL_LN_DIR = '/cluster/home/t122995uhn/projects/data/pdbbind/pdbbind_af2_out/all_ln/'

missing = {}
for code, (pid, seq) in tqdm(df_unique[['prot_id', 'prot_seq']].iterrows(),
                             total=len(df_unique)):
    # Check the main output dirs for the code
    confs = glob(f'{AF2_OUT_DIR}/out*/{code}*_model_*.pdb')
    if len(confs) == 0:
        # fall back to old_outs (keyed by pid) if not found
        confs = glob(f'{AF2_OUT_DIR}/old_outs/out*/{pid}*_model_*.pdb')
        if len(confs) == 0:
            missing[code] = pid
            continue

    # create symlinks in the all_ln dir, named by pid
    for i, c in enumerate(confs):
        os.symlink(c, os.path.join(ALL_LN_DIR, f'{pid}_model_{i}.pdb'))
# %% second pass over old_outs, this time keyed by code
still_missing = {}
for code, pid in tqdm(missing.items()):
    confs = glob(f'{AF2_OUT_DIR}/old_outs/out*/{code}*_model_*.pdb')
    if len(confs) == 0:
        still_missing[code] = pid
        continue
    # create a symlink in the all_ln dir
    for i, c in enumerate(confs):
        os.symlink(c, os.path.join(ALL_LN_DIR, f'{pid}_model_{i}.pdb'))
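A final tally of what is left after both passes (a small addition; an empty dict here is the resolved state):
# anything left here has no AF2 confs under either naming scheme
print(f'still missing after both passes: {len(still_missing)} -> {still_missing}')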
Issue is now resolved. This was a big pain in the ass to debug, and I need to figure out a much better approach to issues like this.
This is where unit testing would be useful, but I am just one man and can't do all of this in a reasonable time frame.
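For reference, a minimal pytest-style sketch of the kind of check that would have caught this earlier; it reuses csv_fp, pdb_p, and af_conf_files as defined above, and the head(10) sample is just illustrative:
import pandas as pd
from src.data_prep.datasets import BaseDataset
from src.utils.residue import Chain

def test_af_confs_match_pdb_sequence():
    # every sampled unique protein should have confs whose sequence matches XY.csv and the pdb
    df_unique = BaseDataset.get_unique_prots(pd.read_csv(csv_fp, index_col=0))
    for code, (pid, seq) in df_unique[['prot_id', 'prot_seq']].head(10).iterrows():
        confs = af_conf_files(pid)
        assert len(confs) > 0, f'no confs for {pid}'
        assert Chain(confs[0]).sequence == seq, f'conf seq mismatch for {pid}'
        assert Chain(pdb_p(code)).sequence == seq, f'pdb seq mismatch for {code}'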
In preparation for #77 we need conformations to input to RING in order to get frequency counts. However, some PDBbind proteins are missing conformations.