frankligy / SNAF

Splicing Neo Antigen Finder (SNAF) is an easy-to-use Python package to identify splicing-derived tumor neoantigens from RNA sequencing data. It further leverages both deep learning and hierarchical Bayesian models to prioritize candidates for experimental validation.
MIT License

How to run netMHCpan for millions of pHLA? #46

Open frankligy opened 3 months ago

frankligy commented 3 months ago

For users who are interested in using the SNAF wrapper function to run netMHCpan for a large number of pHLAs, here is some example code. Let me first show you how to run it; all the function definitions are pasted after that. The code below, running 580K pHLAs on 20 cores, finishes in 28 minutes:

How to?

pep2hla = {
    'PEPTIDE1':['HLA-A02:01','HLA-A03:01'],
    'PEPTIDE2':['HLA-B08:01','HLA-C03:04'],
}

netMHCpan_path = '/gpfs/data/yarmarkovichlab/Frank/SNAF_ecosystem/netMHCpan-4.1/netMHCpan'
cores = 20

full_list = list(pep2hla.keys())
pool = mp.Pool(processes=cores)
list_of_sub_full_list = split_array_to_chunks(full_list,cores)
r = [pool.apply_async(func=each_chunk_func,args=(sub_full_list,)) for sub_full_list in list_of_sub_full_list]
pool.close()
pool.join()
results = []
for collect in r:
    result = collect.get()
    results.append(result)
final = pd.concat(results,axis=0)
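
One caveat: the driver above creates the mp.Pool at module level, which is fine with the fork start method (the Linux default). On macOS or Windows, where spawn is the default, the script would re-execute itself in every worker, so wrap the driver in a main guard. A minimal sketch, using the same names defined further down in this post:

# spawn-safe entry point; pep2hla, cores, each_chunk_func and
# split_array_to_chunks are the objects defined elsewhere in this post
if __name__ == '__main__':
    full_list = list(pep2hla.keys())
    list_of_sub_full_list = split_array_to_chunks(full_list,cores)
    pool = mp.Pool(processes=cores)
    r = [pool.apply_async(func=each_chunk_func,args=(sub_full_list,)) for sub_full_list in list_of_sub_full_list]
    pool.close()
    pool.join()
    results = [collect.get() for collect in r]
    final = pd.concat(results,axis=0)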

Load packages

# all should be included if you successfully installed SNAF
import os,sys
import pandas as pd
import numpy as np
from tqdm import tqdm
import multiprocessing as mp
from io import StringIO
import subprocess

Define main function

def run_netMHCpan(software_path,peptides,hlas,length,cmd_num=1,tmp_dir=None,tmp_name=None):
    '''
    On top of the SNAF function, this adds -BA and extracts different columns from the result.
    The scratch folder is created automatically if it does not exist.
    Right now, only cmd_num=1 is considered.
    '''
    # set the defaults
    if tmp_dir is None:
        tmp_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),'scratch')
    os.makedirs(tmp_dir,exist_ok=True)   # make sure the scratch folder exists
    if tmp_name is None:
        tmp_name = 'input_{}.pep'.format(os.getpid())
    # write the peptides to a temporary .pep file, one per line
    peptides_path = os.path.join(tmp_dir,tmp_name)
    with open(peptides_path,'w') as f:
        for pep in peptides:
            f.write('{}\n'.format(pep))
    # netMHCpan has a hard limit of 1024 characters for -a, so to be safe, use at most 90 HLAs per call (each entry is 11 characters, e.g. 'HLA-B57:01,')
    if len(hlas) <= 90:
        hla_strings = ','.join(hlas)
        if cmd_num == 1:
            reconstruct = ' || '.join(['$3 == ' + '"{}"'.format(pep) for pep in peptides])
            cmd = '{} -p {} -a {} -BA -l {} | awk \'BEGIN {{OFS = "\\t"}} {{if ({}) {{print $3, length($3),$2,$13,$16,$18}}}}\''.format(software_path,peptides_path,hla_strings,length,reconstruct)
            '''
            ../external/netMHCpan-4.1/netMHCpan -p ./test.pep -a HLA-A01:01,HLA-A02:01 -BA -l 9 | awk 'BEGIN {OFS = "\t"} {if ($3 == "AAAWYLWEV" || $3 == "AAGLQDCTM" || $3 == "AARNIVRRA") {print $3, length($3),$2,$13,$16,$18}}'
            '''
        try:
            df = pd.read_csv(StringIO(subprocess.run(cmd,shell=True,capture_output=True).stdout.decode('utf-8')),sep='\t',header=None)
            df.columns = ['peptide','mer','hla','score','binding_nM','identity']
        except:   # no stdout, just no candidates
            df = pd.DataFrame(columns=['peptide','mer','hla','score','binding_nM','identity'])

    else:
        total = len(hlas)   # e.g. 91, 137, 180
        df_store = []
        i = 0 
        while i < total:
            if i + 90 <= total:
                batch_hlas = hlas[i:i+90]
            else:
                batch_hlas = hlas[i:]
            hla_strings = ','.join(batch_hlas)
            if cmd_num == 1:
                reconstruct = ' || '.join(['$3 == ' + '"{}"'.format(pep) for pep in peptides])
                cmd = '{} -p {} -a {} -BA -l {} | awk \'BEGIN {{OFS = "\\t"}} {{if ({}) {{print $3, length($3),$2,$13,$16,$18}}}}\''.format(software_path,peptides_path,hla_strings,length,reconstruct)
                '''
                ../external/netMHCpan-4.1/netMHCpan -p ./test.pep -a HLA-A01:01,HLA-A02:01 -BA -l 9 | awk 'BEGIN {OFS = "\t"} {if ($3 == "AAAWYLWEV" || $3 == "AAGLQDCTM" || $3 == "AARNIVRRA") {print $3, length($3),$2,$13,$16,$18}}'
                '''
            try:
                df = pd.read_csv(StringIO(subprocess.run(cmd,shell=True,capture_output=True).stdout.decode('utf-8')),sep='\t',header=None)
                df.columns = ['peptide','mer','hla','score','binding_nM','identity']
            except:   # no stdout, just no candidates
                df = pd.DataFrame(columns=['peptide','mer','hla','score','binding_nM','identity'])
            df_store.append(df)
            i += 90  
        df = pd.concat(df_store,axis=0)      

    # remove the temporary peptide file
    os.remove(peptides_path)

    return df
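
If you want to sanity-check the main function on a handful of pHLAs before scaling up, a direct call looks like the sketch below; the peptide and alleles are arbitrary examples, and netMHCpan_path must point to your own netMHCpan-4.1 installation:

# smoke test: one 9-mer against two alleles; an empty DataFrame
# means netMHCpan returned no matching rows
df = run_netMHCpan(software_path=netMHCpan_path,
                   peptides=['AAAWYLWEV'],
                   hlas=['HLA-A02:01','HLA-B08:01'],
                   length=9)
print(df)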

A wrapper function

def run_netMHCpan_wrapper(peptide,hlas):
    # hlas is a list of alleles in the form 'HLA-A02:01'
    df = run_netMHCpan(software_path=netMHCpan_path,
                       peptides=[peptide],
                       hlas=hlas,
                       length=len(peptide),
                       cmd_num=1,
                       tmp_dir=None,
                       tmp_name=None)
    return df
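
Because the wrapper sets length=len(peptide), peptides of mixed lengths (e.g. 8-11mers) can go through the same pipeline without first grouping them by length. A hypothetical single call, with a made-up peptide:

df_one = run_netMHCpan_wrapper('AAAWYLWEV',['HLA-A02:01','HLA-A03:01'])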

To parallelize, you need two more helper functions

def each_chunk_func(sub_list):
    list_df = []
    for pep in tqdm(sub_list):
        peptide = pep
        hlas = pep2hla[peptide]
        df = run_netMHCpan_wrapper(peptide,hlas)
        list_df.append(df)
    final = pd.concat(list_df,axis=0)
    return final

def split_array_to_chunks(array,cores=None):
    if not isinstance(array,list):
        raise Exception('split_array_to_chunks function works for list, not ndarray')
    array_index = np.arange(len(array))
    if cores is None:
        cores = mp.cpu_count()
    sub_indices = np.array_split(array_index,cores)
    sub_arrays = []
    for sub_index in sub_indices:
        item_in_group = []
        for i in sub_index:
            item_in_group.append(array[i])
        sub_arrays.append(item_in_group)
    return sub_arrays
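
As a quick sanity check of the chunking helper (pure Python, no netMHCpan needed): np.array_split spreads the remainder over the first chunks, so 10 items on 3 cores come out as chunks of 4, 3, and 3.

chunks = split_array_to_chunks(list('ABCDEFGHIJ'),cores=3)
print([len(c) for c in chunks])   # [4, 3, 3]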