Closed jyaacoub closed 1 month ago
Using both uniprot ID and pdb ID: Filtering by both gene name and drug name is difficult since PDBbind uses shortform names like "NLG" and the oncoKB uses the full name of the drug, and there are multiple aliases for each drug so the shortform name might not match. We would have to check against all possible aliases to ensure that we had the right drug-target pair.
```python #%% import pandas as pd def get_test_oncokbs(train_df=pd.read_csv('/cluster/home/t122995uhn/projects/data/test/PDBbindDataset/nomsa_binary_original_binary/full/cleaned_XY.csv'), oncokb_fp='/cluster/home/t122995uhn/projects/data/tcga/mart_export.tsv', biomart='/cluster/home/t122995uhn/projects/downloads/oncoKB_DrugGenePairList.csv'): #Get gene names for PDBbind dfbm = pd.read_csv(oncokb_fp, sep='\t') dfbm['PDB ID'] = dfbm['PDB ID'].str.lower() train_df.reset_index(names='idx',inplace=True) df_uni = train_df.merge(dfbm, how='inner', left_on='prot_id', right_on='UniProtKB/Swiss-Prot ID') df_pdb = train_df.merge(dfbm, how='inner', left_on='code', right_on='PDB ID') # identifying ovelap with oncokb # df_all will have duplicate entries for entries with multiple gene names... df_all = pd.concat([df_uni, df_pdb]).drop_duplicates(['idx', 'Gene name'])[['idx', 'code', 'Gene name']] dfkb = pd.read_csv(biomart) df_all_kb = df_all.merge(dfkb.drop_duplicates('gene'), left_on='Gene name', right_on='gene', how='inner') trained_genes = set(df_all_kb.gene) #Identify non-trained genes return dfkb[~dfkb['gene'].isin(trained_genes)] train_df = pd.read_csv('/cluster/home/t122995uhn/projects/data/test/PDBbindDataset/nomsa_binary_original_binary/train0/cleaned_XY.csv') val_df = pd.read_csv('/cluster/home/t122995uhn/projects/data/test/PDBbindDataset/nomsa_binary_original_binary/val0/cleaned_XY.csv') train_df = pd.concat([train_df, val_df]) get_test_oncokbs(train_df=train_df) ```
The distribution looks visually different in terms of highly targeted proteins. However, when running a similarity-scoring algorithm to quantify the difference between the two distributions (random split vs. OncoKB split), no real difference was found — though this could be a fault of the scoring algorithm.
Small difference in the % of heavily targeted proteins that appear in the test set (oncoKB test set had fewer proteins that were heavily targeted)
Poor performance is likely due to high diversity in the OncoKB test set, making it harder for the model. This is because the model has to be good at predicting a larger range of proteins.
#### Full distribution:
We can see that the oncoKB test split has way more "lowly" targeted proteins in the test set, which **are naturally going to be harder for the model to be evaluated on due to the range of proteins that the model has to be good at**.
This is counterintuitive, but the reason is that we hand-picked the more heavily targeted proteins for the test set; because we want the test sets to remain the same size, the code I wrote then picked lowly targeted proteins in order to reach that size constraint.
Whereas the random split has a more even distribution, mimicking the training split more closely.
![image](https://github.com/user-attachments/assets/63b27643-82e6-40c0-af8f-58727ad0f84c)
#### Limit to top n frequent proteins
This really shows the impact of hand picking the proteins for the test set. They are shown in the spikes of green from the oncokb test split.
![image](https://github.com/user-attachments/assets/0b1c4509-a11b-48bb-9f98-ab729f3ffc8f)
![image](https://github.com/user-attachments/assets/d53d5d00-fdee-45cd-b2a0-c6a6f5bacd3e)
![image](https://github.com/user-attachments/assets/3b065e27-0949-463c-9978-0b702dd49433)
```python
#%%
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# New (random) split: train + val together form the training pool.
new = '/cluster/home/t122995uhn/projects/splits/new/pdbbind/'
train_df = pd.concat(
    [pd.read_csv(new + part) for part in ('train0.csv', 'val0.csv')],
    axis=0)
test_df = pd.read_csv(new + 'test.csv')

# Everything together, just to sanity-check the total row count.
all_df = pd.concat([train_df, test_df], axis=0)
print(len(all_df))
#%%
# Old (oncoKB) split: its training pool is everything not in its test set.
old = '/cluster/home/t122995uhn/projects/splits/old/pdbbind/'
old_test_df = pd.read_csv(old + 'test.csv')
old_train_df = all_df.loc[~all_df['code'].isin(old_test_df['code'])]
# %%
# this will give us an estimate to how well targeted the training proteins are vs the test proteins
def proteins_targeted(train_df, test_df, split='new', min_freq=0, normalized=False):
    """Overlay the per-protein targeting-frequency distributions of two splits.

    Plots histograms + KDEs of how often each protein appears (``prot_id``
    value counts): ``train_df`` in green, ``test_df`` in blue. Gives an
    estimate of how well targeted the training proteins are vs the test
    proteins. Draws onto a new matplotlib figure; caller shows/saves it.

    Args:
        train_df: split plotted in green (needs a ``prot_id`` column).
        test_df: split plotted in blue (needs a ``prot_id`` column).
        split: label used in the figure title.
        min_freq: keep only proteins appearing strictly more than this often.
        normalized: divide each split's counts by that split's own size.
    """
    # protein count comparison (number of diverse proteins)
    plt.figure(figsize=(18, 8))

    # x-axis is the (optionally normalized) frequency, y-axis the density of
    # proteins with that frequency.
    def _freqs(df, denom):
        # per-protein occurrence counts, filtered to > min_freq
        vc = df.prot_id.value_counts()
        vc = vc[vc > min_freq]
        return list(vc / denom) if normalized else vc.values

    # BUG FIX: the original normalized the TRAIN counts by len(test_df);
    # each split is now normalized by its own size (confirm intent if the two
    # frames were meant to share a common denominator).
    train_counts = _freqs(train_df, len(train_df))
    test_counts = _freqs(test_df, len(test_df))

    sns.histplot(train_counts,
                 bins=50, stat='density', color='green', alpha=0.4)
    sns.histplot(test_counts,
                 bins=50, stat='density', color='blue', alpha=0.4)
    sns.kdeplot(train_counts, color='green', alpha=0.8)
    sns.kdeplot(test_counts, color='blue', alpha=0.8)

    plt.xlabel(f"{'normalized ' if normalized else ''} frequency")
    plt.ylabel("normalized number of proteins with that frequency")
    plt.title(f"Targeted differences for {split} split{f' (> {min_freq})' if min_freq else ''}")
    if not normalized:
        # clamp raw-count axis so a few extreme outliers don't flatten the plot
        plt.xlim(-8, 100)
# proteins_targeted(old_train_df, old_test_df, split='oncoKB')
# plt.show()
# proteins_targeted(train_df, test_df, split='random')
# plt.show()

# Compare the two TEST sets head-to-head at increasing frequency cutoffs
# (0 is the default, i.e. the full distribution).
for cutoff in (0, 5, 10, 15, 20):
    proteins_targeted(old_test_df, test_df,
                      split='oncoKB(green) vs random(blue) test',
                      min_freq=cutoff)
    plt.show()
```
Code
No difference was found: -5.5718 (for random) vs -5.914 (for oncoKB split)
```python
#%%
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# New (random) split: train + val together form the training pool.
new = '/cluster/home/t122995uhn/projects/splits/new/pdbbind/'
train_df = pd.concat(
    [pd.read_csv(new + part) for part in ('train0.csv', 'val0.csv')],
    axis=0)
test_df = pd.read_csv(new + 'test.csv')

# Everything together, just to sanity-check the total row count.
all_df = pd.concat([train_df, test_df], axis=0)
print(len(all_df))
#%%
# Old (oncoKB) split: its training pool is everything not in its test set.
old = '/cluster/home/t122995uhn/projects/splits/old/pdbbind/'
old_test_df = pd.read_csv(old + 'test.csv')
old_train_df = all_df.loc[~all_df['code'].isin(old_test_df['code'])]
# %%
from Bio import pairwise2
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align import substitution_matrices
from tqdm import tqdm
import random
def get_group_similarity(group1, group2, gap_open=-10, gap_extend=-0.5):
    """Average pairwise global-alignment score between two groups of sequences.

    Every sequence in ``group1`` is aligned against every sequence in
    ``group2`` with a BLOSUM62-scored global alignment
    (``Bio.pairwise2.align.globalds``) and the best-alignment scores are
    averaged.

    Args:
        group1, group2: iterables of amino-acid sequence strings.
        gap_open: gap-opening penalty (previously hard-coded to -10).
        gap_extend: gap-extension penalty (previously hard-coded to -0.5).

    Returns:
        Tuple ``(similarity_scores, average_similarity)``: the raw list of
        best-alignment scores and their mean.

    Raises:
        ValueError: if either group is empty (the original crashed with an
            uninformative ZeroDivisionError).
    """
    # Choose a substitution matrix (e.g., BLOSUM62)
    matrix = substitution_matrices.load("BLOSUM62")

    # Function to calculate pairwise similarity score
    def calculate_similarity(seq1, seq2):
        alignments = pairwise2.align.globalds(seq1, seq2, matrix, gap_open, gap_extend)
        return alignments[0][2]  # Return the score of the best alignment

    # Compute pairwise similarity between all sequences in group1 and group2
    similarity_scores = [calculate_similarity(seq1, seq2)
                         for seq1 in group1 for seq2 in group2]
    if not similarity_scores:
        raise ValueError("both sequence groups must be non-empty")

    # Calculate the average similarity score
    average_similarity = sum(similarity_scores) / len(similarity_scores)
    return similarity_scores, average_similarity
# Estimate cross-split similarity: sample `sample_size` unique sequences from
# each split, `trials` times, and average the group-similarity estimates.
train_seq = old_train_df.prot_seq.drop_duplicates().to_list()
test_seq = old_test_df.prot_seq.drop_duplicates().to_list()

sample_size = 5
trials = 100
est_similarity = 0
for _ in tqdm(range(trials)):
    _, avg = get_group_similarity(random.sample(train_seq, sample_size),
                                  random.sample(test_seq, sample_size))
    est_similarity += avg

# BUG FIX: the original divided by a hard-coded 1000 even though only
# `trials` (=100) averages were accumulated, deflating the estimate 10x.
print(est_similarity / trials)
```
Code
There is a significant distribution drift due to the new training split we had created to exclude any proteins in OncoKB...
TODOs: