Splicing Neo Antigen Finder (SNAF) is an easy-to-use Python package for identifying splicing-derived tumor neoantigens from RNA sequencing data. It further leverages both deep learning and hierarchical Bayesian models to prioritize candidates for experimental validation.
For users who are interested in using the SNAF wrapper function to run netMHCpan on a large number of pHLAs, here is some example code. I first show how to run it, and then paste all the function definitions after that. The code below, run on 580K pHLAs with 20 cores, finishes in about 28 minutes:
How to?
# keys are peptide sequences, values are the HLA alleles to test each peptide against
pep2hla = {
    'PEPTIDE1':['HLA-A02:01','HLA-A03:01'],
    'PEPTIDE2':['HLA-B08:01','HLA-C03:04'],
}
netMHCpan_path = '/gpfs/data/yarmarkovichlab/Frank/SNAF_ecosystem/netMHCpan-4.1/netMHCpan'
cores = 20
full_list = list(pep2hla.keys())
pool = mp.Pool(processes=cores)
# split the peptides into one chunk per core and dispatch the chunks asynchronously
list_of_sub_full_list = split_array_to_chunks(full_list,cores)
r = [pool.apply_async(func=each_chunk_func,args=(sub_full_list,)) for sub_full_list in list_of_sub_full_list]
pool.close()
pool.join()
# collect the per-chunk dataframes and stack them into one result table
results = []
for collect in r:
    result = collect.get()
    results.append(result)
final = pd.concat(results,axis=0)
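After the pool finishes, final holds one row per reported peptide-HLA pair, with the columns assigned inside run_netMHCpan. As a quick follow-up, you could for example keep only candidates predicted to bind below 500 nM; that cutoff is a common convention, not anything prescribed by SNAF, and the output file name below is just a placeholder:

# keep predicted binders; 500 nM is a conventional affinity cutoff, adjust to taste
binders = final[final['binding_nM'] < 500]
binders.to_csv('strong_binders.txt',sep='\t',index=None)   # hypothetical output file name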
Load package
# all of these should be available if you installed SNAF successfully
import os,sys
import pandas as pd
import numpy as np
from tqdm import tqdm
import multiprocessing as mp
from io import StringIO
import subprocess
Define main function
def run_netMHCpan(software_path,peptides,hlas,length,cmd_num=1,tmp_dir=None,tmp_name=None):
    '''
    On top of the SNAF function, I added -BA and extract different columns from the result;
    I also make sure the scratch folder gets created.
    Right now, only cmd_num == 1 is considered.
    '''
    # set the defaults
    if tmp_dir is None:
        tmp_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),'scratch')
    if tmp_name is None:
        tmp_name = 'input_{}.pep'.format(os.getpid())
    os.makedirs(tmp_dir,exist_ok=True)   # create the scratch folder if it does not exist yet
    # write the peptides to the temporary input file that netMHCpan will read
    peptides_path = os.path.join(tmp_dir,tmp_name)
    with open(peptides_path,'w') as f:
        for pep in peptides:
            f.write('{}\n'.format(pep))
    # netMHCpan has a hard limit of 1024 characters for -a; each allele costs 11
    # characters ('HLA-B57:01,'), so 90 HLAs (990 characters) per call is a safe maximum
    if len(hlas) <= 90:
        hla_strings = ','.join(hlas)
        if cmd_num == 1:
            reconstruct = ' || '.join(['$3 == ' + '"{}"'.format(pep) for pep in peptides])
            cmd = '{} -p {} -a {} -BA -l {} | awk \'BEGIN {{OFS = "\\t"}} {{if ({}) {{print $3, length($3),$2,$13,$16,$18}}}}\''.format(software_path,peptides_path,hla_strings,length,reconstruct)
            '''
            example of the assembled command:
            ../external/netMHCpan-4.1/netMHCpan -p ./test.pep -a HLA-A01:01,HLA-A02:01 -BA -l 9 | awk 'BEGIN {OFS = "\t"} {if ($3 == "AAAWYLWEV" || $3 == "AAGLQDCTM" || $3 == "AARNIVRRA") {print $3, length($3),$2,$13,$16,$18}}'
            '''
        try:
            df = pd.read_csv(StringIO(subprocess.run(cmd,shell=True,capture_output=True).stdout.decode('utf-8')),sep='\t',header=None)
            df.columns = ['peptide','mer','hla','score','binding_nM','identity']
        except:   # no stdout, meaning no candidates passed the awk filter
            df = pd.DataFrame(columns=['peptide','mer','hla','score','binding_nM','identity'])
    else:
        # more than 90 HLAs: run netMHCpan in batches of 90 and concatenate the results
        total = len(hlas)   # e.g. 91, 137, 180
        df_store = []
        i = 0
        while i < total:
            if i + 90 <= total:
                batch_hlas = hlas[i:i+90]
            else:
                batch_hlas = hlas[i:]
            hla_strings = ','.join(batch_hlas)
            if cmd_num == 1:
                reconstruct = ' || '.join(['$3 == ' + '"{}"'.format(pep) for pep in peptides])
                cmd = '{} -p {} -a {} -BA -l {} | awk \'BEGIN {{OFS = "\\t"}} {{if ({}) {{print $3, length($3),$2,$13,$16,$18}}}}\''.format(software_path,peptides_path,hla_strings,length,reconstruct)
                '''
                example of the assembled command:
                ../external/netMHCpan-4.1/netMHCpan -p ./test.pep -a HLA-A01:01,HLA-A02:01 -BA -l 9 | awk 'BEGIN {OFS = "\t"} {if ($3 == "AAAWYLWEV" || $3 == "AAGLQDCTM" || $3 == "AARNIVRRA") {print $3, length($3),$2,$13,$16,$18}}'
                '''
            try:
                df = pd.read_csv(StringIO(subprocess.run(cmd,shell=True,capture_output=True).stdout.decode('utf-8')),sep='\t',header=None)
                df.columns = ['peptide','mer','hla','score','binding_nM','identity']
            except:   # no stdout, meaning no candidates passed the awk filter
                df = pd.DataFrame(columns=['peptide','mer','hla','score','binding_nM','identity'])
            df_store.append(df)
            i += 90
        df = pd.concat(df_store,axis=0)
    # remove the temporary peptide file from the scratch folder
    os.remove(peptides_path)
    return df
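A wrapper function
The chunk worker below calls run_netMHCpan_wrapper, whose definition is not shown on this page. Here is a minimal sketch of what it could look like, assuming it simply forwards a single peptide and its HLA list to run_netMHCpan, reusing the global netMHCpan_path from the How to block and the peptide's own length for -l:

def run_netMHCpan_wrapper(peptide,hlas):
    # sketch only: the original definition is not included on this page;
    # assumed behavior is one peptide against its HLA list, with -l set to the peptide length
    df = run_netMHCpan(software_path=netMHCpan_path,
                       peptides=[peptide],
                       hlas=hlas,
                       length=len(peptide))
    return df

To parallelize, you need two functions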
def each_chunk_func(sub_list):
    # worker function: each process handles its own chunk of peptides
    list_df = []
    for pep in tqdm(sub_list):
        peptide = pep
        hlas = pep2hla[peptide]
        df = run_netMHCpan_wrapper(peptide,hlas)
        list_df.append(df)
    final = pd.concat(list_df,axis=0)
    return final
def split_array_to_chunks(array,cores=None):
    if not isinstance(array,list):
        raise Exception('split_array_to_chunks function works for list, not ndarray')
    array_index = np.arange(len(array))
    if cores is None:
        cores = mp.cpu_count()
    sub_indices = np.array_split(array_index,cores)
    sub_arrays = []
    for sub_index in sub_indices:
        item_in_group = []
        for i in sub_index:
            item_in_group.append(array[i])
        sub_arrays.append(item_in_group)
    return sub_arrays
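As a quick illustration of the chunking helper: np.array_split keeps the chunks balanced, placing the remainder toward the front, so five peptides over two cores become groups of three and two. The peptide names below are placeholders:

# toy check of split_array_to_chunks
demo = ['PEP1','PEP2','PEP3','PEP4','PEP5']
print(split_array_to_chunks(demo,cores=2))   # [['PEP1', 'PEP2', 'PEP3'], ['PEP4', 'PEP5']]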