Closed jyaacoub closed 11 months ago
Checking all pdb files for the correct sequence reduces the number of misses but we still have some that are missing:
import json, os
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
from tqdm.contrib.concurrent import thread_map # Use tqdm for multithreading
from src.data_processing.downloaders import Downloader
root_dir = '/cluster/home/t122995uhn/projects/data/kiba_tmp'
save_dir = f'{root_dir}/structures'
# Contains protein sequences mapped to uniprotIDs
unique_prots = json.load(open(f'{root_dir}/proteins.txt', 'r'))
# [...] send to https://www.uniprot.org/id-mapping to get associated pdb files
# this returns a tsv containing all matching pdbs for each unique uniprotID
df = pd.read_csv(f'{root_dir}/kiba_mapping_pdb.tsv', sep='\t')
#%% Downloading pdbs
def download_pdb(pdb_id):
try:
Downloader.download_PDBs([pdb_id], save_dir=save_dir, tqdm_disable=True)
return pdb_id # Return the downloaded PDB ID for progress tracking
except Exception as e:
print(f"Error downloading {pdb_id}: {str(e)}")
# Get unique PDBs
pdbs = df['To'].unique()
# Number of concurrent threads (adjust as needed)
num_threads = 4
# Use tqdm with ThreadPoolExecutor
with ThreadPoolExecutor(max_workers=num_threads) as executor:
# Use thread_map for tqdm integration
downloaded_pdbs = list(thread_map(download_pdb, pdbs, desc="Downloading PDBs", total=len(pdbs)))
print("DONE.")
#%%
import json, os
import pandas as pd
from src.utils.residue import Chain
from tqdm import tqdm
from prody import parsePDB
root_dir = '/cluster/home/t122995uhn/projects/data/kiba_tmp'
save_dir = f'{root_dir}/structures'
pdb_fp = lambda x: f'{save_dir}/{x}.pdb'
# Contains protein sequences mapped to uniprotIDs
unique_prots = json.load(open(f'{root_dir}/proteins.txt', 'r'))
# [...] send to https://www.uniprot.org/id-mapping to get associated pdb files
# this returns a tsv containing all matching pdbs for each unique uniprotID
df = pd.read_csv(f'{root_dir}/kiba_mapping_pdb.tsv', sep='\t')
#%%
matches = {} # tracks matching sequences to pdb structures
fails = []
for i, row in tqdm(df.iterrows(), desc='Matching pdbs', total=len(df)):
uniprot = row['From']
pdb = row['To']
if uniprot in matches: continue # already matched
try:
seq = parsePDB(pdb_fp(pdb), subset='ca').getSequence()
except Exception as e:
fails.append((pdb, e))
# raise Exception(f'Error on {i}, ({uniprot}, {pdb}).') from e
if seq == unique_prots[uniprot]:
matches[uniprot] = pdb
#%% Fails are due to empty pdb files
for c, _ in fails:
os.remove(pdb_fp(c))
Matching on each chain gives only slightly more matches (61), but still a lot are missing...
for i, row in tqdm(df.iterrows(), desc='Matching pdbs', total=len(df)):
uniprot = row['From']
pdb = row['To']
if uniprot in matches: continue # already matched
try:
pdb_s = parsePDB(pdb_fp(pdb), subset='ca')
seq = pdb_s.getSequence() # includes all chains
except Exception as e:
fails.append((pdb, e))
# raise Exception(f'Error on {i}, ({uniprot}, {pdb}).') from e
curr_seq = unique_prots[uniprot]
if (len(seq) == len(curr_seq) and seq == curr_seq) or \
(len(seq) > len(curr_seq) and curr_seq in seq)or \
(len(seq) < len(curr_seq) and seq in curr_seq):
matches[uniprot] = pdb
Solution was to just use High quality generated structures using AlphaFold!
The downloaded kiba structures using UniProtIDs on https://www.uniprot.org/id-mapping failed to work. They map to structures that do not match with the sequences provided in the dataset.
This is an issue since the dataset comes with alignment files and predicted contact maps that depend on that input sequence being the same.
Potential solutions:
(1) is the most ideal but difficult to do (also unknown if the correct structures are even available), (3) is the easiest but might have unforeseeable consequences during training and (2) is a middle ground between both but would require retraining previous models to get updated results and confirmation that the dataset performance has not been altered too much.
Mismatch info:
Pretty much all of the available PDB structures are mismatched.
Even some of the AlphaFold structures are mismatched (2 out of the 35 that were non-hits from uniprot mapping search)
Code to replicate above: