create executable mokapot packages for linux, windows, mac

timosachsenberg commented 1 year ago

Motivation: Mokapot is evolving much faster compared to percolator. It would make a lot of sense if we could use it as a replacement for percolator. To this end it would make sense to create installer (e.g., using an action in our or the mokapot repository see also https://github.com/wfondrie/mokapot/issues/101).

Previously I was successful with

pyinstaller --onefile MokapotAdapter.py

and this script (which probably can be modernized with the pandas export etc.:

#!/usr/bin/env python3
import sys # for command line arguments
from pyopenms import *
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from idXML2df import *
from calcQ import *
import argparse

# update scores updates the scores read from the input file with e.g. class-specific q-values and stores them in output_file
# score_name is the name of the meta value. score_type_name the name of the score type used to annotate in the idXML file
def updateScores(input_file, df, output_file, score_column_name = "class-specific_q-val", score_type_name="q-value", higher_better=False):
                map_id2qvalue = dict()
                for index, row in df.iterrows():
                                key = row["SpecId"] + "_" + str(row["PSMId"])
                                map_id2qvalue[key] = row[score_column_name]

                # load old data again
                prot_ids = []; pep_ids = []
                IdXMLFile().load(input_file, prot_ids, pep_ids)

                new_pep_ids = []
                for peptide_id in pep_ids:
                                hits = peptide_id.getHits()
                                psmid = 1
                                specid = peptide_id.getMetaValue("spectrum_reference")
                                new_hits = []
                                for h in hits:
                                                key = specid + "_" + str(psmid)
                                                if key in map_id2qvalue:
                                                                h.setScore(map_id2qvalue[key]) # set new main score (will be used for sorting)
                                                                new_hits.append(h)
                                                psmid += 1
                                peptide_id.setHits(new_hits)
                                peptide_id.setScoreType(score_type_name)
                                peptide_id.setHigherScoreBetter(higher_better)
                                new_pep_ids.append(peptide_id)

                IdXMLFile().store(output_file, prot_ids, pep_ids)

def cli(input, output):
        # load NuXL results
        input_file = input
        output_file = output
        print("Reading input file: ", input_file)
        d = readAndProcessIdXML(input, top=1) # load top hits per spectrum

        ############################### rescoring
        import mokapot
        import logging
        from sklearn.model_selection import GridSearchCV
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.model_selection import RepeatedStratifiedKFold

        np.random.seed(42) # for reproducible results

        # that's how one can define custom models for mokapot
        class RandomForestModel(mokapot.model.Model):
                        def __init__(
                                        self,
                                        scaler=None,
                                        train_fdr=0.1,
                                        max_iter=10,
                                        direction=None,
                                        override=False,
                                        subset_max_train=None,
                        ):

                                        # Employ random forest model from sklearn package with repeated k-fold cross validation
                                        rand_forest_model = RandomForestClassifier()
                                        cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=10)

                                        # Define a dictionary with specific parameter settings for the param_grid flag or use defaults with {}
                                        estimator = GridSearchCV(rand_forest_model, param_grid={}, refit=True, cv=cv)

                                        super().__init__(
                                                        estimator=estimator,
                                                        scaler=scaler,
                                                        train_fdr=train_fdr,
                                                        max_iter=max_iter,
                                                        direction=direction,
                                                        override=override,
                                                        subset_max_train=subset_max_train,
                                        )

        # Write out log file
        log = True
        if log:
                        logging.basicConfig(
                                        filename="mokapot.log",
                                        level=logging.INFO,
                                        format="%(levelname)s: %(message)s",
                        )
        logging.info('Started')

        # The list of features we want to use for training
        features = ["NuXL:score", 'charge2', 'charge3', 'charge4', 'charge5', 'isotope_error', 'precursor_mz_error_ppm', 'NuXL:total_loss_score', 'NuXL:partial_loss_score',
                        'variable_modifications', 'n_theoretical_peaks', 'NuXL:mass_error_p', 'NuXL:immonium_score',
                'NuXL:precursor_score', 'NuXL:marker_ions_score', 'NuXL:MIC', 'NuXL:err', 'NuXL:Morph', 'NuXL:modds', 'NuXL:pl_MIC', 'NuXL:pl_err',
                'NuXL:pl_Morph', 'NuXL:pl_modds', 'NuXL:pl_pc_MIC', 'NuXL:pl_im_MIC', 'NuXL:total_Morph', 'NuXL:total_HS', 'NuXL:tag_XLed', 'NuXL:tag_unshifted', 'NuXL:tag_shifted',
                'NuXL:total_MIC', 'NuXL:NA_length', 'NuXL:best_localization_score',
                'precursor_purity', 'NuXL:aminoacid_max_tag',
                'nr_candidates', 'NuXL:explained_peak_fraction', 'NuXL:theo_peak_fraction', 'NuXL:wTop50', 'NuXL:ladder_score', 'NuXL:sequence_score'
                ]

        psms = mokapot.dataset.LinearPsmDataset(
                d[ ["Score", "Label", "Peptide", "SpecId", "PSMId", "NuXL:isXL"] + features], # columns we pass to mokapot
                target_column="Label",
                spectrum_columns="SpecId",
                peptide_column="Peptide",
                group_column="NuXL:isXL",
                feature_columns=features)

        # Set the defined model
        moka_model = RandomForestModel() # custom model, =None would be percolator default model

        # Conduct the mokapot analysis:
        results, models = mokapot.brew(psms, model=moka_model, test_fdr=0.20)
        logging.info('Finished')

        # merge all results into one df
        results.psms = results.group_confidence_estimates[0].psms # peptide targets
        results.psms = results.psms.append(results.group_confidence_estimates[1].psms) # XL targets
        results.psms = results.psms.append(results.group_confidence_estimates[0].decoy_confidence_estimates["psms"]) # peptide decoys
        results.psms = results.psms.append(results.group_confidence_estimates[1].decoy_confidence_estimates["psms"]) # XL decoys

        # reannotate all other columns (we might need them later)
        mokapot_result = pd.merge(results.psms, d[ ['SpecId', 'ScanNr', "PSMId"] + features ], left_on=['SpecId', 'PSMId'], right_on=['SpecId', 'PSMId']) # somehow we need to merge back the features used
        mokapot_result["Label"] = mokapot_result["Label"].astype(int) # convert true/false back to 1/0

        ############################################## before rescoring
        mokapot_result["Score"] = mokapot_result["mokapot score"] # set mokapot scores as new score score so we use that for sorting. check if the two digits for RF is enough
        mokapot_result = calcQ(mokapot_result) # calculate q-values for XL and non-XL class, stored in "class-specific_q-val"
        mokapot_result = addRanks(mokapot_result)
        print(mokapot_result)

        print("Writing output file: ", output_file)
        updateScores(input_file, mokapot_result, output_file, score_column_name="mokapot score", score_type_name="svm", higher_better=True) # we call the score svm here

if __name__ == '__main__':
        parser = argparse.ArgumentParser(description='Adapter to Mokapot for NA-XL-rescoring.')
        parser.add_argument('-in', dest='input', type=str)
        parser.add_argument('-out', dest='output', type=str)
        args, unknown = parser.parse_known_args() # ignores additional parameters
        cli(args.input, args.output)

jpfeuffer commented 1 year ago

Puuh.. install a whole python environment?

timosachsenberg commented 1 year ago

it is not installed but included in the executable. Good alternatives?

jpfeuffer commented 1 year ago

Yeah I know. No, not really alternatives. Is it on conda at least?

timosachsenberg commented 1 year ago

yep conda install -c bioconda mokapot

Arslan-Siraj commented 1 year ago

So, do we want to create an executable by using pyinstaller? This is the goal?

timosachsenberg commented 1 year ago

yeah I think it would be great to have executables that could be used as "drop-in" replacement for how we currently use percolator. The way users would not have to deal with all dependencies etc.

How I did it in the past was: I took the script above and created the installer. But this also includes a full pyopenms version in the executable. This is a bit awkward and will increase the size of it a lot.

It is probably better to just create an installer for the plain mokapot executables. Then we could create a MokapotAdapter that uses this executable similar to percolatoradapter in another sprint.

OpenMS / THIRDPARTY

create executable mokapot packages for linux, windows, mac #100