Closed samWieczorek closed 2 years ago
This test has been integrated in DAPAR. In a second time, need to be integrated in Prostar with the peptide level pipeline
I've had a look at it, and I think that a first integration (at least) of PEPA can be done quite easily:
Concretely: 1/ the heaviest thing in PEPA is the calculation of the related components (until we can integrate my fast version, cf. ticket #18815). On the other hand, this calculation depends only on the pep-prot matrix (named X), and thus this calculation can be done once for all comparisons. It would thus be necessary to take this part of the pepa.test function to make a separate function, and add its result in the parameters of the pepa.test function. To externalise this function, we just need to: (1) duplicate lines 421 and 422 and (2) extract the part from line 444 to 451. The result of the function is cc, directly called in the rest of the pepa.test code. 2/ make a function that first calculates cc, then calls the pepa.test function with the default parameters (and thus cc in addition), in order to calculate the p-values for all possible pairs of comparisons (in 1vs1 or 1vsAll). The list of p-values at the output of pepa.test is in the llr.map.pv object.
Later on we can improve the integration:
The PEPA test should only be available for peptide data, while Limma and t-test should only be available for protein data.
In the future I could make a function that does a grouped t-test at the peptide level (but without taking the shared peptides) In the same way, we could also add MSqRob. However, until then, there will only be one peptitic test, namely PEPA...
``Here is a new version of the PEPA code.
cc.new <- lapply(cc.fast, FUN=function(v){ as.character(c(which(proteinNames %in% v$proteins),(which(peptideNames %in% v$peptides)+length(proteinNames)))) } ) return(cc.new) }
accelerated.pepa.test <- function(X, y, cc,n1, n2, global=FALSE, use.lm=FALSE){ cc <- adapt.cc(cc, rownames(X), colnames(X)) n <- n1+n2 q <- nrow(X) # Number of peptides p <- ncol(X) # Number of proteins if(nrow(y) != q){ stop('[pepa.test] y must have the same number of rows as X (one per peptide)') } y1 <- y[, 1:n1] # SAM: a remplacer pour tenir compte des contrasts 1vs1 et 1vsAll y2 <- y[, -(1:n1)] # SAM: a remplacer pour tenir compte des contrasts 1vs1 et 1vsAll if(global){ lik0 <- LH0(X, y1, y2) mse.h0 <- lik0$ss/(qn) llr <- mse.h1 <- rep(NA, p) for(jj in 1:p){ print(sprintf('[pepa.test] Computing H1 likelihood for protein %d', jj)) lik1 <- LH1(X, y1, y2, jj) mse.h1[jj] <- lik1$ss/(qn) llr[jj] <- (qn) (log(lik0$ss) - log(lik1$ss)) }
llr.map <- llr.map.pv <- s <- wchi2 <- NULL }else{
llr <- lh1 <- lh0 <- n.pep <- rep(NA, p) mse.h0 <- mse.h1 <- c() ii <- 0 for(ee in cc){ ii <- ii + 1 print(sprintf('[pepa.test] Computing H0 likelihood for connected component %d/%d', ii, length(cc))) ee <- as.numeric(ee) local.prot <- ee[ee <= p] if(length(local.prot) == 0){next()} local.pep <- ee[ee > p] - p local.X <- X[local.pep, local.prot, drop=FALSE] prot.barcode <- apply(local.X, 2, FUN=function(v) paste(as.character(v), collapse='-')) bc.dup <- duplicated(prot.barcode) equiv.X <- local.X[, !bc.dup, drop=FALSE] if(use.lm){ lik0 <- LH0.lm(equiv.X, y1[local.pep, , drop=FALSE], y2[local.pep, , drop=FALSE]) mse.h0[ii] <- mean(residuals(lik0$lmm.res)^2) }else{ lik0 <- LH0(equiv.X, y1[local.pep, , drop=FALSE], y2[local.pep, , drop=FALSE]) mse.h0[ii] <- lik0$ss/(length(local.pep)n) } for(jj in local.prot){ print(sprintf('[pepa.test] Computing H1 likelihood for protein %d', jj)) prot.equiv.idx <- match(names(prot.barcode[!bc.dup])[prot.barcode[!bc.dup] == prot.barcode[colnames(X)[jj]]], colnames(equiv.X)) if(use.lm){ lik1 <- LH1.lm(equiv.X, y1[local.pep, , drop=FALSE], y2[local.pep, , drop=FALSE], prot.equiv.idx) mse.h1[jj] <- mean(residuals(lik1$lmm.res)^2) llr[jj] <- 2(lik1$ll - lik0$ll) }else{ lik1 <- LH1(equiv.X, y1[local.pep, , drop=FALSE], y2[local.pep, , drop=FALSE], prot.equiv.idx) mse.h1[jj] <- lik1$ss/(length(local.pep)n) llr[jj] <- (length(local.pep)n) * (log(lik0$ss) - log(lik1$ss)) } } } s <- mse.h1 if(p < 500){# Following SAM paper recommendations, we don't attempt to estimate s0 if p<500 s1 <- quantile(s, 0.05) }else{ s1 <- fudge2LRT(mse.h0, mse.h1, cc, n, p, s)$s.zero } samLRT.res <- samLRT(mse.h0, mse.h1, cc, n, p, s1) llr.map <- samLRT.res$llr.sam
s2.est <- mean(s) wchi2 <- rep((s2.est + s1)/s2.est, length(de)) llr.map.pv <- 1 - pchisq(wchi2*llr.map, 1) }
llr.pv <- 1 - pchisq(llr, 1) return(list(llr=llr, llr.map=llr.map, llr.pv=llr.pv, llr.map.pv=llr.map.pv, mse.h0=mse.h0, mse.h1=mse.h1, s=s, wchi2=wchi2)) }
Describe the feature
Implement the following code
`` adapt.cc <- function(cc.fast, peptideNames, proteinNames){ cc.new <- lapply(cc.fast, FUN=function(v){ as.character(c(which(proteinNames %in% v$proteins),(which(peptideNames %in% v$peptides)+length(proteinNames)))) } ) return(cc.new) }
cc <- adapt.cc(get.pep.prot.cc(X), rownames(X), colnames(X))
accelerated.pepa.test <- function(X, y, cc,n1, n2, global=FALSE, use.lm=FALSE){
cc <- adapt.cc(cc, rownames(X), colnames(X)) n <- n1+n2 q <- nrow(X) # Number of peptides p <- ncol(X) # Number of proteins if(nrow(y) != q){ stop('[pepa.test] y must have the same number of rows as X (one per peptide)') }
y1 <- y[, 1:n1] # SAM: a remplacer pour tenir compte des contrasts 1vs1 et 1vsAll y2 <- y[, -(1:n1)] # SAM: a remplacer pour tenir compte des contrasts 1vs1 et 1vsAll
if(global){ lik0 <- LH0(X, y1, y2) mse.h0 <- lik0$ss/(qn) llr <- mse.h1 <- rep(NA, p) for(jj in 1:p){ print(sprintf('[pepa.test] Computing H1 likelihood for protein %d', jj)) lik1 <- LH1(X, y1, y2, jj)
mse.h1[jj] <- lik1$ss/(qn)
llr[jj] <- (qn) (log(lik0$ss) - log(lik1$ss)) }
We don't compute a regularized version of global-PEPA which is already a form of ridging
}else{
Compute 'local' likelihoods, using only peptides belonging to
}
Compute p-value for llr
llr.pv <- 1 - pchisq(llr, 1)
return(list(llr=llr, llr.map=llr.map, llr.pv=llr.pv, llr.map.pv=llr.map.pv, mse.h0=mse.h0, mse.h1=mse.h1, s=s, wchi2=wchi2)) } ``