workflow4metabolomics / metaMS

Move to https://github.com/workflow4metabolomics/tools-metabolomics
2 stars 2 forks source link

Add check MSP wrapper #23

Open yguitton opened 6 years ago

yguitton commented 6 years ago

Some msp files are not fully compatible with metaMS. For example, msp files from AMDIS use "(mz int)" instead of "mz int;" as the mass spectrum descriptor.

The idea for the wrapper is: (1) load the msp file with a new read.msp that can deal with more msp formats, then (2) use write.msp to create a converted msp file.

yguitton commented 6 years ago

Here is a first version of the code: a modified read.msp function. The idea is then to use the metaMS write.msp function on its output.

# Read an MSP mass-spectral library file, accepting several peak-list
# layouts ("mz int; mz int;", AMDIS-style "(mz int) (mz int)", or one
# "mz int" pair per line).
#
# Arguments:
#   file      path (or connection) handed to scan().
#   only.org  if TRUE, keep only compounds whose Formula field consists
#             solely of the elements in org.set plus digits. When the file
#             contains no Formula line at all, every compound is kept.
#   org.set   element symbols regarded as "organic".
#   noNumbers regular expressions; header fields whose name matches any of
#             them are kept as text. All other fields are converted to
#             numeric whenever the value parses as a number. NULL selects
#             the built-in default list.
#
# Value:
#   A list with one element per compound. Each element is a list of the
#   header fields (NAME, RT, RI and CASNO are renamed to Name, rt, std.RI
#   and CAS) plus `pspectrum`, a two-column numeric matrix with columns
#   "mz" and "intensity". An input without any "Name:" line yields list().
#
# Errors:
#   stop() when a compound has no "Num Peaks" field, when the peak count
#   does not match that field, or when a peak lacks an intensity.
read.msp <- function (file, only.org = FALSE, org.set = c("C", "H", "D",
    "N", "O", "P", "S"), noNumbers = NULL)
{
    # Text after the first ":" of each line, leading blanks dropped.
    # NA when the line has no colon or nothing follows it.
    field.value <- function(x) {
        val <- sub("^[^:]*:[[:space:]]*", "", x)
        val[!grepl(":", x, fixed = TRUE) | val == ""] <- NA_character_
        val
    }

    # Positions (within strs) of the Formula lines that contain nothing but
    # org.set symbols, blanks and digits.
    is.org <- function(strs, org.set) {
        formulas <- field.value(strs)
        org.string <- paste0("[ ", paste(org.set, collapse = ""), " ]")
        leftover <- gsub(org.string, "", formulas)
        # What remains must be a plain number (the element counts).
        which(!is.na(suppressWarnings(as.numeric(leftover))))
    }

    # Parse the lines of a single compound into its fields and spectrum.
    read.compound <- function(strs, noNumbers) {
        if (is.null(noNumbers))
            # NOTE(review): the last two entries were one fused literal
            # ("...[Ee],RW") in the previous version, presumably a missing
            # quote; confirm that "RW" is a wanted field name.
            noNumbers <- c("[Nn][Aa][Mm][Ee]", "CAS?", "stdFile", "date",
                "validated", "ChemspiderID", "SMILES", "InChI",
                "Class", "[Cc][Oo][Mm][Mm][Ee][Nn][Tt]?", "csLinks",
                "[fF][oO][Rr][Mm]?", "[Ss][oO][Uu][Rr][Cc][Ee]", "RW")

        # fields.idx holds line numbers in strs; fields holds the text in
        # front of the first colon of those lines. The two index spaces
        # are kept apart on purpose.
        fields.idx <- grep(":", strs, fixed = TRUE)
        fields <- sub(":.*$", "", strs[fields.idx])
        pk.pos <- grep("[Nn][Uu][Mm] [Pp][Ee][Aa][Kk][Ss]", fields)
        if (length(pk.pos) == 0)
            stop("No spectrum found")
        pk.line <- fields.idx[pk.pos[1]]

        # Header fields are the "key: value" lines above the peak list.
        hdr <- which(fields.idx < pk.line)
        cmpnd <- as.list(field.value(strs[fields.idx[hdr]]))

        # Rename essential fields for metaMS. Whole-name comparison so
        # that names merely containing "RT" or "RI" are left alone.
        renames <- c(NAME = "Name", RT = "rt", RI = "std.RI", CASNO = "CAS")
        nms <- fields[hdr]
        hit <- match(toupper(nms), names(renames))
        nms[!is.na(hit)] <- renames[hit[!is.na(hit)]]
        names(cmpnd) <- nms

        # Convert every field NOT listed in noNumbers to numeric when its
        # value is a valid number; everything else stays text.
        keep.text <- if (length(noNumbers) > 0) {
            grepl(paste(noNumbers, collapse = "|"), nms)
        } else {
            rep(FALSE, length(nms))
        }
        cmpnd[!keep.text] <- lapply(cmpnd[!keep.text], function(x) {
            y <- suppressWarnings(as.numeric(x))
            if (is.na(y)) x else y
        })

        cname <- cmpnd[["Name"]]
        npeaks <- suppressWarnings(as.numeric(field.value(strs[pk.line])))
        if (is.na(npeaks))
            stop("Invalid Num Peaks field in compound ", cname)

        # Everything below the Num Peaks line is peak data. Negative
        # indexing stays correct when Num Peaks is the last line.
        peak.lines <- strs[-seq_len(pk.line)]
        if (any(grepl(";", peak.lines, fixed = TRUE))) {
            # "mz int; mz int;" layout
            pks <- unlist(strsplit(peak.lines, ";", fixed = TRUE))
        } else {
            # "(mz int) (mz int)" layout, or a bare "mz int" per line
            pks <- unlist(strsplit(peak.lines, ")", fixed = TRUE))
            pks <- gsub("(", "", pks, fixed = TRUE)
        }
        pks <- trimws(as.character(pks))
        pks <- pks[pks != ""]
        if (length(pks) != npeaks)
            stop("Not the right number of peaks in compound ", cname)

        pklst <- strsplit(pks, "[[:space:]]+")
        pklst <- lapply(pklst, function(x) x[x != ""])
        if (any(lengths(pklst) < 2))
            stop("Malformed peak (missing intensity) in compound ", cname)
        cmz <- as.numeric(vapply(pklst, function(p) p[1], character(1)))
        cintens <- as.numeric(vapply(pklst, function(p) p[2], character(1)))

        if (any(table(cmz) > 1)) {
            warning("Duplicate mass in compound ", cname,
                " (CAS ", cmpnd[["CAS"]], ")... summing up intensities")
            # Sum per distinct mass, keeping the result a matrix like the
            # non-duplicate case.
            mz.unique <- sort(unique(cmz))
            cintens <- vapply(mz.unique,
                function(m) sum(cintens[which(cmz == m)]), numeric(1))
            cmz <- mz.unique
        }
        finaltab <- matrix(c(cmz, cintens), ncol = 2,
            dimnames = list(NULL, c("mz", "intensity")))

        c(cmpnd, list(pspectrum = finaltab))
    }

    # scan() with sep = "\n" returns one string per non-blank line.
    huhn <- scan(file, what = "", sep = "\n", quiet = TRUE)
    starts <- grep("^name:", huhn, ignore.case = TRUE)
    if (length(starts) == 0)
        return(list())
    ends <- c(starts[-1] - 1, length(huhn))

    if (only.org) {
        formulas <- grep("^formula:", huhn, ignore.case = TRUE)
        if (length(formulas) > 0) {
            # Attribute each Formula line to the compound that contains
            # it, so compounds lacking a Formula cannot shift the mapping.
            owner <- findInterval(formulas, starts)
            keep <- sort(unique(owner[is.org(huhn[formulas], org.set)]))
            keep <- keep[keep > 0]
            starts <- starts[keep]
            ends <- ends[keep]
        }
    }

    lapply(seq_along(starts), function(i) read.compound(huhn[starts[i]:ends[i]],
        noNumbers = noNumbers))
}
jsaintvanne commented 5 years ago

When you talk about the msp file, do you mean the database used as input?

yguitton commented 5 years ago

Yes, as msp files for metaMS must contain certain fields.

Le mer. 26 juin 2019 09:19, Julien Saint-Vanne notifications@github.com a écrit :

When you talk about the msp file, do you mean the database used as input?

— You are receiving this because you were assigned. Reply to this email directly, view it on GitHub https://github.com/workflow4metabolomics/metaMS/issues/23?email_source=notifications&email_token=ABI76KNAUDNRRGWACXEFHN3P4MKAVA5CNFSM4EN7OIOKYY3PNVWWK3TUL52HS4DFVREXG43VMVBW63LNMVXHJKTDN5WW2ZLOORPWSZGODYSSZJQ#issuecomment-505752742, or mute the thread https://github.com/notifications/unsubscribe-auth/ABI76KNM756MKFLYEYENT2DP4MKAVANCNFSM4EN7OIOA .