Renamed the previous conformation files (confs) to use the pid so that we can identify which unique proteins are left; the missing PDBs can be found on H4H in ~/projects/MutDTA/missing_pdbbind_confs.txt.
import pandas as pd
from glob import glob
from tqdm import tqdm
import logging
import os

df = pd.read_csv('/cluster/home/t122995uhn/projects/data/PDBbindDataset/nomsa_binary_original_binary/full/XY.csv', index_col=0)

# renaming conformations to use pid instead of PDB code
def get_pid(df, code):
    if df is not None and code in df.index:
        return df.loc[code]['prot_id']
    return code

def af_conf_files(dir, pid):
    return glob(f'{dir}/{pid}_model_*.pdb')

dir_p = '/cluster/home/t122995uhn/projects/data/PDBbind_afConf/'

df_unique = df[['prot_id']].drop_duplicates()
missing = []
for idx, (pid,) in df_unique[['prot_id']].iterrows():
    af_confs_idx = af_conf_files(dir_p, idx)
    if len(af_confs_idx) == 0:
        logging.debug(f"no confs for {idx}")
        missing.append(idx)
        continue
    for f in af_confs_idx:
        new_f = f.replace(idx, pid)
        os.rename(f, new_f)
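The codes with no confs can then be written out to the text file mentioned at the top (a small sketch; the path is the H4H one referenced above):
# dump the codes that still have no conformations (see missing_pdbbind_confs.txt above)
with open(os.path.expanduser('~/projects/MutDTA/missing_pdbbind_confs.txt'), 'w') as f:
    f.write('\n'.join(missing))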
Below is code to get the number of mismatched, missing, and safe codes:
from src.data_prep.datasets import PDBbindProcessor, BaseDataset
import pandas as pd

DATA_ROOT = '/cluster/home/t122995uhn/projects/data/v2020-other-PL/'
AF_CONF_DIR = '/cluster/home/t122995uhn/projects/data/PDBbind_afConf/'
csv_fp = '/cluster/home/t122995uhn/projects/data/PDBbindDataset/nomsa_ring3_original_binary/full/XY.csv'

df = pd.read_csv(csv_fp, index_col=0)
df_unique = BaseDataset.get_unique_prots(df)

# %%
import os
from glob import glob

def pdb_p(code):
    return os.path.join(DATA_ROOT, code, f'{code}_protein.pdb')

def af_conf_files(pid) -> list[str]:
    return glob(f'{AF_CONF_DIR}/{pid}_model_*.pdb')

from src.utils.residue import Chain
from tqdm import tqdm
import logging

logging.getLogger().setLevel(logging.INFO)

mismatched = {}
missing = {}
safe = {}
for code, (pid, seq) in tqdm(df_unique[['prot_id', 'prot_seq']].iterrows(),
                             total=len(df_unique)):
    afs = af_conf_files(pid)
    if len(afs) == 0:
        logging.debug(f'Missing confs for {pid}')
        missing[code] = pid
        continue

    af_seq = Chain(afs[0]).sequence
    if seq != af_seq:
        logging.debug(f'Mismatched sequence for {pid}')
        mismatched[code] = (pid, seq, af_seq)
        continue

    safe[code] = pid
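# the point of this script is the counts, so tally the three groups
print(f'mismatched: {len(mismatched)}, missing: {len(missing)}, safe: {len(safe)}')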
# sanity check: for safe codes, the af conf sequence must also match the pdb sequence
for code, pid in safe.items():
    s0 = Chain(pdb_p(code)).sequence
    s1 = Chain(af_conf_files(pid)[0]).sequence
    assert s0 == s1, f"mismatch for af conf and pdb on ({code}, {pid})"

# saving to output
with open('mismatched_codes.csv', 'w') as f:
    f.write('code,pid,seq,af_seq\n')
    for code, (pid, seq, af_seq) in mismatched.items():
        f.write(f'{code},{pid},{seq},{af_seq}\n')

with open('missing_codes.csv', 'w') as f:
    f.write('code,pid\n')
    for code, pid in missing.items():
        f.write(f'{code},{pid}\n')

with open('safe_codes.csv', 'w') as f:
    f.write('code,pid\n')
    for code, pid in safe.items():
        f.write(f'{code},{pid}\n')
import pandas as pd

df_mm = pd.read_csv('mismatched_codes.csv', index_col=0)
df_m = pd.read_csv('missing_codes.csv', index_col=0)
df_s = pd.read_csv('safe_codes.csv', index_col=0)

import os
from tqdm import tqdm

msa = lambda c: f'/cluster/projects/kumargroup/msa/output/{c}.msa.a3m'

no_msa = []
for code in df_mm.index:
    if not os.path.exists(msa(code)):
        no_msa.append(code)
from above: no_msa = ['4jmx']
# msa files are indexed by pdb code
import os, shutil
from tqdm import tqdm

df_mm = pd.read_csv('mismatched_codes_v2.csv', index_col=0)
df_m = pd.read_csv('missing_codes_v2.csv', index_col=0)

msa = lambda c: f'/cluster/projects/kumargroup/msa/output/{c}.msa.a3m'
codes = list(df_mm.index) + list(df_m.index)

out_dir = '/cluster/home/t122995uhn/projects/data/pdbbind/a3m_unique_prot/all/'
for code in tqdm(codes):
    if code == "4jmx":  # no MSA exists for this code (see above)
        continue
    fp = msa(code)
    out_p = f"{out_dir}/{code}.a3m"
    shutil.copyfile(fp, out_p)
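As a quick sanity check, the copies can be verified against the code list (a sketch assuming the loop above completed; 4jmx is excluded since it has no MSA):
# verify every code except 4jmx now has a copied .a3m file in out_dir
copied = {os.path.splitext(f)[0] for f in os.listdir(out_dir)}
assert set(codes) - {'4jmx'} <= copied, 'some MSA files were not copied'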
#!/bin/bash
# Number of splits, provided as the first argument (defaults to 10)
num_splits=${1:-10}

# Define the source directory containing the .a3m input files
src_dir="/cluster/home/t122995uhn/projects/data/pdbbind/a3m_unique_prot/all/"
ext=".a3m"

# Get the parent directory of the source directory
parent_dir=$(dirname "$src_dir")

# Create destination directories based on the number of splits
declare -a dest_dirs=()
for ((i=1; i<=num_splits; i++)); do
    dest_dirs+=("$parent_dir/part$i/")
done

# Make sure the destination directories exist
for dir in "${dest_dirs[@]}"; do
    mkdir -p "$dir"
done

# Calculate the total number of files in the source directory
total_files=$(ls "$src_dir"/*"$ext" | wc -l)
echo "Total files: $total_files"

# Calculate the number of files in each part
files_per_part=$((total_files / num_splits))

# Loop through the source directory and link files into the destination directories
file_counter=0
for file in "$src_dir"/*"$ext"; do
    part=$((file_counter / files_per_part + 1))
    # clamp so the remainder from the integer division lands in the last part
    ((part > num_splits)) && part=$num_splits
    dest_dir="${dest_dirs[$part - 1]}"
    ln -s "$file" "$dest_dir/$(basename "$file" "$ext")$ext"
    file_counter=$((file_counter + 1))
done
echo "Splitting complete: $file_counter"
# Note: without the clamp above, the remainder files get part=num_splits+1,
# so "$dest_dir" expands to an empty string and ln targets the filesystem
# root; that is exactly what failed on the first run:
# ln: failed to create symbolic link ‘/6v1c.a3m’: Permission denied
# ln: failed to create symbolic link ‘/8gpb.a3m’: Permission denied
# ln: failed to create symbolic link ‘/9icd.a3m’: Permission denied
# Those three were linked into the last part manually, e.g.:
# ln -s /cluster/home/$me/projects/data/pdbbind/a3m_unique_prot/all/6v1c.a3m /cluster/home/$me/projects/data/pdbbind/a3m_unique_prot/part10
Combined old "safe" confs with new updated confs:
# %% Checking XY.csv and regenerating mismatches
from src.data_prep.processors import PDBbindProcessor
from src.data_prep.datasets import BaseDataset
from src.data_prep.feature_extraction.protein import multi_get_sequences
from tqdm import tqdm
import pandas as pd
import os
from glob import glob

DATA_ROOT = '/cluster/home/t122995uhn/projects/data/pdbbind/v2020-other-PL/'
AF_CONF_DIR = '/cluster/home/t122995uhn/projects/data/pdbbind/PDBbind_afConf/'
csv_fp = '/cluster/home/t122995uhn/projects/data/PDBbindDataset/nomsa_ring3_original_binary/full/XY.csv'

def pdb_p(code):
    return os.path.join(DATA_ROOT, code, f'{code}_protein.pdb')

def af_conf_files(pid) -> list[str]:
    return glob(f'{AF_CONF_DIR}/{pid}_model_*.pdb')

def cmap_p(code):
    return os.path.join(DATA_ROOT, 'cmaps', f'{code}.npy')

df = pd.read_csv(csv_fp, index_col=0)
df_unique = BaseDataset.get_unique_prots(df)

# %% create symbolic links of the found confs in a new folder
REMAIN_DIR = '/cluster/home/t122995uhn/projects/data/pdbbind/a3m_unique_prot/remainder/'
AF2_OUT_DIR = '/cluster/home/t122995uhn/projects/data/pdbbind/pdbbind_af2_out/'
ALL_LN_DIR = '/cluster/home/t122995uhn/projects/data/pdbbind/pdbbind_af2_out/all_ln/'

missing = {}
for code, (pid, seq) in tqdm(df_unique[['prot_id', 'prot_seq']].iterrows(),
                             total=len(df_unique)):
    # Check the main output dirs for the code
    confs = glob(f'{AF2_OUT_DIR}/out*/{code}*_model_*.pdb')
    if len(confs) == 0:
        # fall back to old_outs (keyed by pid) if not found
        confs = glob(f'{AF2_OUT_DIR}/old_outs/out*/{pid}*_model_*.pdb')
        if len(confs) == 0:
            missing[code] = pid
            continue

    # create symlinks in the all_ln dir, named by pid
    for i, c in enumerate(confs):
        os.symlink(c, os.path.join(ALL_LN_DIR, f'{pid}_model_{i}.pdb'))
# %% second pass over old_outs, this time keyed by code
still_missing = {}
for code, pid in tqdm(missing.items()):
    confs = glob(f'{AF2_OUT_DIR}/old_outs/out*/{code}*_model_*.pdb')
    if len(confs) == 0:
        still_missing[code] = pid
        continue
    # create a symlink in the all_ln dir
    for i, c in enumerate(confs):
        os.symlink(c, os.path.join(ALL_LN_DIR, f'{pid}_model_{i}.pdb'))
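A final tally of what is left after both passes (a small addition; an empty dict here is the resolved state):
# anything left here has no AF2 confs under either naming scheme
print(f'still missing after both passes: {len(still_missing)} -> {still_missing}')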
Issue is now resolved. This was a big pain in the ass to debug, and I need to figure out a much better approach to issues like this.
This is where unit testing would be useful, but I am just one man and can't do all of this in a reasonable time frame.
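For reference, a minimal pytest-style sketch of the kind of check that would have caught this earlier; it reuses csv_fp, pdb_p, and af_conf_files as defined above, and the head(10) sample is just illustrative:
import pandas as pd
from src.data_prep.datasets import BaseDataset
from src.utils.residue import Chain

def test_af_confs_match_pdb_sequence():
    # every sampled unique protein should have confs whose sequence matches XY.csv and the pdb
    df_unique = BaseDataset.get_unique_prots(pd.read_csv(csv_fp, index_col=0))
    for code, (pid, seq) in df_unique[['prot_id', 'prot_seq']].head(10).iterrows():
        confs = af_conf_files(pid)
        assert len(confs) > 0, f'no confs for {pid}'
        assert Chain(confs[0]).sequence == seq, f'conf seq mismatch for {pid}'
        assert Chain(pdb_p(code)).sequence == seq, f'pdb seq mismatch for {code}'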
In preparation for #77 we need conformations to input to RING in order to get frequency counts. However, some PDBbind proteins are missing conformations.