Open jyaacoub opened 1 week ago
Relevant columns from TCGA MAF:
- `Hugo_Symbol` for gene name
- `Tumor_Sample_Barcode` for patient ID and cancer info (BCR batch code can be mapped to cancer project -> https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables)
Mapping the barcode (`Tumor_Sample_Barcode`) to a TCGA project can be done with the tcga_code_tables from https://gdc.cancer.gov/system/files/public/file/tcga_code_tables.zip
The results below were produced with the wrong MAF file (see https://github.com/jyaacoub/MutDTA/issues/111#issuecomment-2195617311 below); they are collapsed for brevity, and the details show the previous output:
```python
#%% 1.Gather data for davis,kiba and pdbbind datasets
import os
import pandas as pd
import matplotlib.pyplot as plt
from src.analysis.utils import combine_dataset_pids
from src import config as cfg

# Proteins to look up in TCGA; the `db` and `prot_id` columns are used below.
df_prots = pd.read_csv('../data/test_prots.csv')

#%% 2. Load TCGA data
df_tcga = pd.read_csv('../data/TCGA_ALL.maf', sep='\t')

#%% 3. Pre filtering
# Keep missense mutations only. `.copy()` makes the result an independent
# frame so the column assignments below do not trigger SettingWithCopyWarning.
is_missense = df_tcga['Variant_Classification'] == 'Missense_Mutation'
df_tcga = df_tcga[is_missense].copy()

# Second '/'-separated field of `Protein_position`, taken as the sequence length.
df_tcga['seq_len'] = pd.to_numeric(df_tcga['Protein_position'].str.split('/').str[1])

df_tcga = df_tcga[df_tcga['seq_len'] < 5000]
df_tcga['seq_len'].plot.hist(bins=100, title="sequence length histogram capped at 5K")
plt.show()

df_tcga = df_tcga[df_tcga['seq_len'] < 1200].copy()
df_tcga['seq_len'].plot.hist(bins=100, title="sequence length after capped at 1.2K")
plt.show()  # also display the second histogram when run as a plain script

#%% 4. Merging df_prots with TCGA
df_tcga['uniprot'] = df_tcga['SWISSPROT'].str.split('.').str[0]
dfm = df_tcga.merge(df_prots[df_prots.db != 'davis'],
                    left_on='uniprot', right_on='prot_id', how='inner')
# for davis we have to merge on HUGO_SYMBOLS
dfm_davis = df_tcga.merge(df_prots[df_prots.db == 'davis'],
                          left_on='Hugo_Symbol', right_on='prot_id', how='inner')
dfm = pd.concat([dfm, dfm_davis], axis=0)
del dfm_davis  # to save mem

#%%
# Explicit NA markers that leave out 'NA', so a literal 'NA' code in the code
# tables stays a string instead of being parsed as NaN and lost in the merge.
CODE_TABLE_NA_VALUES = ['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A',
                        '#N/A', 'N/A', 'n/a', '', '#NA', 'NULL', 'null',
                        'NaN', '-NaN', 'nan', '-nan']
code_table_kwargs = dict(sep='\t', keep_default_na=False,
                         na_values=CODE_TABLE_NA_VALUES)

tcga_tss = pd.read_csv('../data/tcga_code_tables/tissueSourceSite.tsv', **code_table_kwargs)
# Strip surrounding whitespace so the 'Study Name' join keys compare equal.
tcga_tss['Study Name'] = tcga_tss['Study Name'].str.strip()
tcga_bcr = pd.read_csv('../data/tcga_code_tables/bcrBatchCode.tsv', **code_table_kwargs)

# One row per study from the batch-code table, so the left merge cannot
# multiply the tissue-source-site rows.
tcga_codes = tcga_tss.merge(tcga_bcr.drop_duplicates(subset='Study Name'),
                            on='Study Name', how='left')
tcga_codes = tcga_codes[['TSS Code', 'Study Abbreviation']]

#%%
# get second id to match with TSS code for cancer type
dfm['TSS Code'] = dfm['Tumor_Sample_Barcode'].str.split('-').str[1]
dfm = dfm.merge(tcga_codes, on='TSS Code', how='left')
```
## For all prots excluding BindingDB, we have the following counts for each cancer type:
![image](https://github.com/jyaacoub/MutDTA/assets/50300488/50fdb109-41fc-4a26-9ec0-37502c407e82)
## With BindingDB:
```python
#%%
import pandas as pd
from tdc.multi_pred import DTI

# Existing protein table; the dropped columns are not needed in the combined file.
adf = pd.read_csv('../data/all_prots.csv')
adf = adf.drop(columns=['db_idx', 'prot_seq', 'seq_len'])

# %% BindingDB:
# alternatives: DTI(name='BindingDB_Ki'), DTI(name='BindingDB_IC50')
data = DTI(name='BindingDB_Kd')

# Build the BindingDB rows as one chain of non-inplace operations: the original
# mutated a column-sliced frame in place, which triggers pandas'
# chained-assignment (SettingWithCopy) warning.
bdf = (
    data.get_data()[['Target_ID']]
    .dropna(subset=['Target_ID'])
    .drop_duplicates(subset=['Target_ID'])
    .rename(columns={'Target_ID': 'prot_id'})
    .assign(db='BindingDB', code=None)
)
bdf = bdf[['db', 'code', 'prot_id']]  # just reordering, not really necessary

# %% Combining all 4 datasets
adf = pd.concat([adf, bdf])
adf.to_csv('../data/all_prots_bindingdb.csv', index=False)
```
```python
#%% 1.Gather data for davis,kiba and pdbbind datasets
import os
import pandas as pd
import matplotlib.pyplot as plt
from src.analysis.utils import combine_dataset_pids
from src import config as cfg

# Proteins to look up in TCGA (including the BindingDB entries); the `db` and
# `prot_id` columns are used below.
df_prots = pd.read_csv('../data/all_prots_bindingdb.csv')

#%% 2. Load TCGA data
df_tcga = pd.read_csv('../data/TCGA_ALL.maf', sep='\t')

#%% 3. Pre filtering
# Keep missense mutations only. `.copy()` makes the result an independent
# frame so the column assignments below do not trigger SettingWithCopyWarning.
is_missense = df_tcga['Variant_Classification'] == 'Missense_Mutation'
df_tcga = df_tcga[is_missense].copy()

# Second '/'-separated field of `Protein_position`, taken as the sequence length.
df_tcga['seq_len'] = pd.to_numeric(df_tcga['Protein_position'].str.split('/').str[1])

df_tcga = df_tcga[df_tcga['seq_len'] < 5000]
df_tcga['seq_len'].plot.hist(bins=100, title="sequence length histogram capped at 5K")
plt.show()

df_tcga = df_tcga[df_tcga['seq_len'] < 1200].copy()
df_tcga['seq_len'].plot.hist(bins=100, title="sequence length after capped at 1.2K")
plt.show()  # also display the second histogram when run as a plain script

#%% 4. Merging df_prots with TCGA
df_tcga['uniprot'] = df_tcga['SWISSPROT'].str.split('.').str[0]
dfm = df_tcga.merge(df_prots[df_prots.db != 'davis'],
                    left_on='uniprot', right_on='prot_id', how='inner')
# for davis we have to merge on HUGO_SYMBOLS
dfm_davis = df_tcga.merge(df_prots[df_prots.db == 'davis'],
                          left_on='Hugo_Symbol', right_on='prot_id', how='inner')
dfm = pd.concat([dfm, dfm_davis], axis=0)
del dfm_davis  # to save mem

#%%
# Explicit NA markers that leave out 'NA', so a literal 'NA' code in the code
# tables stays a string instead of being parsed as NaN and lost in the merge.
CODE_TABLE_NA_VALUES = ['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A',
                        '#N/A', 'N/A', 'n/a', '', '#NA', 'NULL', 'null',
                        'NaN', '-NaN', 'nan', '-nan']
code_table_kwargs = dict(sep='\t', keep_default_na=False,
                         na_values=CODE_TABLE_NA_VALUES)

tcga_tss = pd.read_csv('../data/tcga_code_tables/tissueSourceSite.tsv', **code_table_kwargs)
# Strip surrounding whitespace so the 'Study Name' join keys compare equal.
tcga_tss['Study Name'] = tcga_tss['Study Name'].str.strip()
tcga_bcr = pd.read_csv('../data/tcga_code_tables/bcrBatchCode.tsv', **code_table_kwargs)

# One row per study from the batch-code table, so the left merge cannot
# multiply the tissue-source-site rows.
tcga_codes = tcga_tss.merge(tcga_bcr.drop_duplicates(subset='Study Name'),
                            on='Study Name', how='left')
tcga_codes = tcga_codes[['TSS Code', 'Study Abbreviation']]

#%%
# get second id to match with TSS code for cancer type
dfm['TSS Code'] = dfm['Tumor_Sample_Barcode'].str.split('-').str[1]
dfm = dfm.merge(tcga_codes, on='TSS Code', how='left')
```
Code to add BindingDB entries to **all_prots**
Code for matching (same but uses bindingdb csv)
The above results were produced with the incomplete MAF file; the actual MAF file (the one TCGAbiolinks also uses, as per their docs) can be retrieved from https://gdc.cancer.gov/about-data/publications/mc3-2017.
1026
3600963
```python #%% 1.Gather data for davis,kiba and pdbbind datasets import os import pandas as pd import matplotlib.pyplot as plt from src.analysis.utils import combine_dataset_pids from src import config as cfg ROOT_DIR = "../downloads" merge_by_prot_id = False # making sure NA doesnt get dropped due to pandas parsing it as NaN tcga_tss = pd.read_csv(f'{ROOT_DIR}/tcga_code_tables/tissueSourceSite.tsv', sep='\t', keep_default_na=False, na_values=['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A', 'n/a','', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', '']) tcga_tss['Study Name'] = tcga_tss['Study Name'].str.strip() tcga_bcr = pd.read_csv(f'{ROOT_DIR}/tcga_code_tables/bcrBatchCode.tsv', sep='\t', keep_default_na=False, na_values=['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A', 'n/a','', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', '']) tcga_codes = tcga_tss.merge(tcga_bcr.drop_duplicates(subset='Study Name'), on='Study Name', how='left') tcga_codes = tcga_codes[['TSS Code', 'Study Abbreviation']] #%% Load up db df_tcga = pd.read_csv(f'/cluster/home/t122995uhn/projects/data/tcga/mc3/mc3.v0.2.8.PUBLIC.maf', sep='\t', na_filter=False) df_tcga = df_tcga[['Tumor_Sample_Barcode', 'Hugo_Symbol', 'SWISSPROT', 'Variant_Type', 'Variant_Classification']] df_tcga['case'] = df_tcga['Tumor_Sample_Barcode'].str[:12] df_tcga['uniprot'] = df_tcga['SWISSPROT'].str.split('_').str[0] # merge with tcga codes # Using second id to match with TSS code for cancer type df_tcga['TSS Code'] = df_tcga['Tumor_Sample_Barcode'].str.split('-').str[1] df_tcga = df_tcga.merge(tcga_codes, on='TSS Code', how='left') # 3. Drop duplicates df_tcga_uni=df_tcga.drop_duplicates(subset='Tumor_Sample_Barcode') print(len(df_tcga_uni)) print(df_tcga_uni['Study Abbreviation'].value_counts()) ```
8214
91501
```python #%% 4. Merging df_prots with TCGA df_prots = pd.read_csv(f'{ROOT_DIR}/test_prots_gene_names.csv') # df_prots = df_prots[df_prots.db != 'BindingDB'] if merge_by_prot_id: dfm = df_tcga.merge(df_prots[df_prots.db != 'davis'], left_on='uniprot', right_on='prot_id', how='inner') # for davis we have to merge on HUGO_SYMBOLS dfm_davis = df_tcga.merge(df_prots[df_prots.db == 'davis'], left_on='Hugo_Symbol', right_on='prot_id', how='inner') dfm = pd.concat([dfm,dfm_davis], axis=0) del dfm_davis # to save mem else: # merge by gene name dfm = df_tcga.merge(df_prots, left_on='Hugo_Symbol', right_on='gene_name', how='inner') #%% dfm_uni = dfm.drop_duplicates('case') print('Total # of cases:', len(dfm_uni)) print('Total # of rows:',len(dfm)) print('######','Cancer Types:', "######") print(dfm_uni['Study Abbreviation'].value_counts()) print('######','Variants:', "######") print(dfm_uni['Variant_Classification'].value_counts()) ```
Test set level table
Gene level cancer type heatmap
patient specific bar graph stratified by cancer type: