samWieczorek / old_Prostar.2.0

0 stars 0 forks source link

[FEATURE Prostar 2] New statistical test #10

Closed samWieczorek closed 2 years ago

samWieczorek commented 2 years ago

Describe the feature

Implement the following code

`` adapt.cc <- function(cc.fast, peptideNames, proteinNames){ cc.new <- lapply(cc.fast, FUN=function(v){ as.character(c(which(proteinNames %in% v$proteins),(which(peptideNames %in% v$peptides)+length(proteinNames)))) } ) return(cc.new) }

cc <- adapt.cc(get.pep.prot.cc(X), rownames(X), colnames(X))

accelerated.pepa.test <- function(X, y, cc,n1, n2, global=FALSE, use.lm=FALSE){

cc <- adapt.cc(cc, rownames(X), colnames(X)) n <- n1+n2 q <- nrow(X) # Number of peptides p <- ncol(X) # Number of proteins if(nrow(y) != q){ stop('[pepa.test] y must have the same number of rows as X (one per peptide)') }

y1 <- y[, 1:n1] # SAM: a remplacer pour tenir compte des contrasts 1vs1 et 1vsAll y2 <- y[, -(1:n1)] # SAM: a remplacer pour tenir compte des contrasts 1vs1 et 1vsAll

if(global){ lik0 <- LH0(X, y1, y2) mse.h0 <- lik0$ss/(qn) llr <- mse.h1 <- rep(NA, p) for(jj in 1:p){ print(sprintf('[pepa.test] Computing H1 likelihood for protein %d', jj)) lik1 <- LH1(X, y1, y2, jj)
mse.h1[jj] <- lik1$ss/(q
n)
llr[jj] <- (qn) (log(lik0$ss) - log(lik1$ss)) }

We don't compute a regularized version of global-PEPA which is already a form of ridging

llr.map <- llr.map.pv <- s <- wchi2 <- NULL

}else{

Compute 'local' likelihoods, using only peptides belonging to

## protein of interest, or connected proteins.    
llr <- lh1 <- lh0 <- n.pep <- rep(NA, p)
mse.h0 <- mse.h1 <- c()
ii <- 0
for(ee in cc){
  ii <- ii + 1
  print(sprintf('[pepa.test] Computing H0 likelihood for connected component %d/%d', ii, length(cc)))                
  ee <- as.numeric(ee)
  local.prot <- ee[ee <= p]
  if(length(local.prot) == 0){next()}
  local.pep <- ee[ee > p] - p
  local.X <- X[local.pep, local.prot, drop=FALSE]
  prot.barcode <- apply(local.X, 2, FUN=function(v) paste(as.character(v), collapse='-'))
  bc.dup <- duplicated(prot.barcode)
  equiv.X <- local.X[, !bc.dup, drop=FALSE]

  if(use.lm){
    lik0 <- LH0.lm(equiv.X, y1[local.pep, , drop=FALSE], y2[local.pep, , drop=FALSE])
    mse.h0[ii] <- mean(residuals(lik0$lmm.res)^2)
  }else{
    lik0 <- LH0(equiv.X, y1[local.pep, , drop=FALSE], y2[local.pep, , drop=FALSE])
    mse.h0[ii] <- lik0$ss/(length(local.pep)*n)
  }

  for(jj in local.prot){
    print(sprintf('[pepa.test] Computing H1 likelihood for protein %d', jj))
    prot.equiv.idx <- match(names(prot.barcode[!bc.dup])[prot.barcode[!bc.dup] == prot.barcode[colnames(X)[jj]]], colnames(equiv.X))

    if(use.lm){
      lik1 <- LH1.lm(equiv.X, y1[local.pep, , drop=FALSE], y2[local.pep, , drop=FALSE], prot.equiv.idx)
      mse.h1[jj] <- mean(residuals(lik1$lmm.res)^2)
      llr[jj] <- 2*(lik1$ll - lik0$ll)
    }else{
      lik1 <- LH1(equiv.X, y1[local.pep, , drop=FALSE], y2[local.pep, , drop=FALSE], prot.equiv.idx)
      mse.h1[jj] <- lik1$ss/(length(local.pep)*n)    
      llr[jj] <- (length(local.pep)*n) * (log(lik0$ss) - log(lik1$ss))
    }
  }
}

s <- mse.h1   
if(p < 500){# Following SAM paper recommendations, we don't attempt to estimate s0 if p<500
  s1 <- quantile(s, 0.05)
}else{
  s1 <- fudge2LRT(mse.h0, mse.h1, cc, n, p, s)$s.zero
}
samLRT.res <- samLRT(mse.h0, mse.h1, cc, n, p, s1)
llr.map <- samLRT.res$llr.sam
## Reweight LLR-sam for calibration of its p-value
s2.est <- mean(s)
wchi2 <- rep((s2.est + s1)/s2.est, length(de))
llr.map.pv <- 1 - pchisq(wchi2*llr.map, 1)

}

Compute p-value for llr

llr.pv <- 1 - pchisq(llr, 1)

return(list(llr=llr, llr.map=llr.map, llr.pv=llr.pv, llr.map.pv=llr.map.pv, mse.h0=mse.h0, mse.h1=mse.h1, s=s, wchi2=wchi2)) } ``

samWieczorek commented 2 years ago

This test has been integrated in DAPAR. In a second time, need to be integrated in Prostar with the peptide level pipeline

samWieczorek commented 2 years ago

I've had a look at it, and I think that a first integration (at least) of PEPA can be done quite easily:

Concretely: 1/ the heaviest thing in PEPA is the calculation of the related components (until we can integrate my fast version, cf. ticket #18815). On the other hand, this calculation depends only on the pep-prot matrix (named X), and thus this calculation can be done once for all comparisons. It would thus be necessary to take this part of the pepa.test function to make a separate function, and add its result in the parameters of the pepa.test function. To externalise this function, we just need to: (1) duplicate lines 421 and 422 and (2) extract the part from line 444 to 451. The result of the function is cc, directly called in the rest of the pepa.test code. 2/ make a function that first calculates cc, then calls the pepa.test function with the default parameters (and thus cc in addition), in order to calculate the p-values for all possible pairs of comparisons (in 1vs1 or 1vsAll). The list of p-values at the output of pepa.test is in the llr.map.pv object.

Later on we can improve the integration:

The PEPA test should only be available for peptide data, while Limma and t-test should only be available for protein data.

In the future I could make a function that does a grouped t-test at the peptide level (but without taking the shared peptides) In the same way, we could also add MSqRob. However, until then, there will only be one peptitic test, namely PEPA...

samWieczorek commented 2 years ago

``Here is a new version of the PEPA code.

`` adapt.cc <- function(cc.fast, peptideNames, proteinNames){

cc.new <- lapply(cc.fast, FUN=function(v){ as.character(c(which(proteinNames %in% v$proteins),(which(peptideNames %in% v$peptides)+length(proteinNames)))) } ) return(cc.new) }

cc <- adapt.cc(get.pep.prot.cc(X), rownames(X), colnames(X))

        accelerated.pepa.test <- function(X, y, cc,n1, n2, global=FALSE, use.lm=FALSE){   cc <- adapt.cc(cc, rownames(X), colnames(X)) n <- n1+n2 q <- nrow(X) # Number of peptides p <- ncol(X) # Number of proteins if(nrow(y) != q){ stop('[pepa.test] y must have the same number of rows as X (one per peptide)') }   y1 <- y[, 1:n1] # SAM: a remplacer pour tenir compte des contrasts 1vs1 et 1vsAll y2 <- y[, -(1:n1)] # SAM: a remplacer pour tenir compte des contrasts 1vs1 et 1vsAll   if(global){ lik0 <- LH0(X, y1, y2) mse.h0 <- lik0$ss/(qn) llr <- mse.h1 <- rep(NA, p) for(jj in 1:p){ print(sprintf('[pepa.test] Computing H1 likelihood for protein %d', jj)) lik1 <- LH1(X, y1, y2, jj) mse.h1[jj] <- lik1$ss/(qn) llr[jj] <- (qn) (log(lik0$ss) - log(lik1$ss)) }

We don't compute a regularized version of global-PEPA which is already a form of ridging

llr.map <- llr.map.pv <- s <- wchi2 <- NULL }else{

Compute 'local' likelihoods, using only peptides belonging to

protein of interest, or connected proteins.

llr <- lh1 <- lh0 <- n.pep <- rep(NA, p) mse.h0 <- mse.h1 <- c() ii <- 0 for(ee in cc){ ii <- ii + 1 print(sprintf('[pepa.test] Computing H0 likelihood for connected component %d/%d', ii, length(cc))) ee <- as.numeric(ee) local.prot <- ee[ee <= p] if(length(local.prot) == 0){next()} local.pep <- ee[ee > p] - p local.X <- X[local.pep, local.prot, drop=FALSE] prot.barcode <- apply(local.X, 2, FUN=function(v) paste(as.character(v), collapse='-')) bc.dup <- duplicated(prot.barcode) equiv.X <- local.X[, !bc.dup, drop=FALSE]   if(use.lm){ lik0 <- LH0.lm(equiv.X, y1[local.pep, , drop=FALSE], y2[local.pep, , drop=FALSE]) mse.h0[ii] <- mean(residuals(lik0$lmm.res)^2) }else{ lik0 <- LH0(equiv.X, y1[local.pep, , drop=FALSE], y2[local.pep, , drop=FALSE]) mse.h0[ii] <- lik0$ss/(length(local.pep)n) }   for(jj in local.prot){ print(sprintf('[pepa.test] Computing H1 likelihood for protein %d', jj)) prot.equiv.idx <- match(names(prot.barcode[!bc.dup])[prot.barcode[!bc.dup] == prot.barcode[colnames(X)[jj]]], colnames(equiv.X))   if(use.lm){ lik1 <- LH1.lm(equiv.X, y1[local.pep, , drop=FALSE], y2[local.pep, , drop=FALSE], prot.equiv.idx) mse.h1[jj] <- mean(residuals(lik1$lmm.res)^2) llr[jj] <- 2(lik1$ll - lik0$ll) }else{ lik1 <- LH1(equiv.X, y1[local.pep, , drop=FALSE], y2[local.pep, , drop=FALSE], prot.equiv.idx) mse.h1[jj] <- lik1$ss/(length(local.pep)n) llr[jj] <- (length(local.pep)n) * (log(lik0$ss) - log(lik1$ss)) } } }   s <- mse.h1 if(p < 500){# Following SAM paper recommendations, we don't attempt to estimate s0 if p<500 s1 <- quantile(s, 0.05) }else{ s1 <- fudge2LRT(mse.h0, mse.h1, cc, n, p, s)$s.zero } samLRT.res <- samLRT(mse.h0, mse.h1, cc, n, p, s1) llr.map <- samLRT.res$llr.sam

Reweight LLR-sam for calibration of its p-value

s2.est <- mean(s) wchi2 <- rep((s2.est + s1)/s2.est, length(de)) llr.map.pv <- 1 - pchisq(wchi2*llr.map, 1) }  

Compute p-value for llr

llr.pv <- 1 - pchisq(llr, 1)   return(list(llr=llr, llr.map=llr.map, llr.pv=llr.pv, llr.map.pv=llr.map.pv, mse.h0=mse.h0, mse.h1=mse.h1, s=s, wchi2=wchi2)) }

``adapt.cc <- function(cc.fast, peptideNames, proteinNames){ cc.new <- lapply(cc.fast, FUN=function(v){ as.character(c(which(proteinNames %in% v$proteins),(which(peptideNames %in% v$peptides)+length(proteinNames)))) } ) return(cc.new) }

cc <- adapt.cc(get.pep.prot.cc(X), rownames(X), colnames(X))

accelerated.pepa.test <- function(X, y, cc,n1, n2, global=FALSE, use.lm=FALSE){

cc <- adapt.cc(cc, rownames(X), colnames(X)) n <- n1+n2 q <- nrow(X) # Number of peptides p <- ncol(X) # Number of proteins if(nrow(y) != q){ stop('[pepa.test] y must have the same number of rows as X (one per peptide)') }

y1 <- y[, 1:n1] # SAM: a remplacer pour tenir compte des contrasts 1vs1 et 1vsAll y2 <- y[, -(1:n1)] # SAM: a remplacer pour tenir compte des contrasts 1vs1 et 1vsAll

if(global){ lik0 <- LH0(X, y1, y2) mse.h0 <- lik0$ss/(qn) llr <- mse.h1 <- rep(NA, p) for(jj in 1:p){ print(sprintf('[pepa.test] Computing H1 likelihood for protein %d', jj)) lik1 <- LH1(X, y1, y2, jj)
mse.h1[jj] <- lik1$ss/(q
n)
llr[jj] <- (qn) (log(lik0$ss) - log(lik1$ss)) }

We don't compute a regularized version of global-PEPA which is already a form of ridging

llr.map <- llr.map.pv <- s <- wchi2 <- NULL

}else{

Compute 'local' likelihoods, using only peptides belonging to

## protein of interest, or connected proteins.    
llr <- lh1 <- lh0 <- n.pep <- rep(NA, p)
mse.h0 <- mse.h1 <- c()
ii <- 0
for(ee in cc){
  ii <- ii + 1
  print(sprintf('[pepa.test] Computing H0 likelihood for connected component %d/%d', ii, length(cc)))                
  ee <- as.numeric(ee)
  local.prot <- ee[ee <= p]
  if(length(local.prot) == 0){next()}
  local.pep <- ee[ee > p] - p
  local.X <- X[local.pep, local.prot, drop=FALSE]
  prot.barcode <- apply(local.X, 2, FUN=function(v) paste(as.character(v), collapse='-'))
  bc.dup <- duplicated(prot.barcode)
  equiv.X <- local.X[, !bc.dup, drop=FALSE]

  if(use.lm){
    lik0 <- LH0.lm(equiv.X, y1[local.pep, , drop=FALSE], y2[local.pep, , drop=FALSE])
    mse.h0[ii] <- mean(residuals(lik0$lmm.res)^2)
  }else{
    lik0 <- LH0(equiv.X, y1[local.pep, , drop=FALSE], y2[local.pep, , drop=FALSE])
    mse.h0[ii] <- lik0$ss/(length(local.pep)*n)
  }

  for(jj in local.prot){
    print(sprintf('[pepa.test] Computing H1 likelihood for protein %d', jj))
    prot.equiv.idx <- match(names(prot.barcode[!bc.dup])[prot.barcode[!bc.dup] == prot.barcode[colnames(X)[jj]]], colnames(equiv.X))

    if(use.lm){
      lik1 <- LH1.lm(equiv.X, y1[local.pep, , drop=FALSE], y2[local.pep, , drop=FALSE], prot.equiv.idx)
      mse.h1[jj] <- mean(residuals(lik1$lmm.res)^2)
      llr[jj] <- 2*(lik1$ll - lik0$ll)
    }else{
      lik1 <- LH1(equiv.X, y1[local.pep, , drop=FALSE], y2[local.pep, , drop=FALSE], prot.equiv.idx)
      mse.h1[jj] <- lik1$ss/(length(local.pep)*n)    
      llr[jj] <- (length(local.pep)*n) * (log(lik0$ss) - log(lik1$ss))
    }
  }
}

s <- mse.h1   
if(p < 500){# Following SAM paper recommendations, we don't attempt to estimate s0 if p<500
  s1 <- quantile(s, 0.05)
}else{
  s1 <- fudge2LRT(mse.h0, mse.h1, cc, n, p, s)$s.zero
}
samLRT.res <- samLRT(mse.h0, mse.h1, cc, n, p, s1)
llr.map <- samLRT.res$llr.sam
## Reweight LLR-sam for calibration of its p-value
s2.est <- mean(s)
wchi2 <- rep((s2.est + s1)/s2.est, length(de))
llr.map.pv <- 1 - pchisq(wchi2*llr.map, 1)

}

Compute p-value for llr

llr.pv <- 1 - pchisq(llr, 1)

return(list(llr=llr, llr.map=llr.map, llr.pv=llr.pv, llr.map.pv=llr.map.pv, mse.h0=mse.h0, mse.h1=mse.h1, s=s, wchi2=wchi2)) } ``

samWieczorek commented 2 years ago

In the first implem in Prostar, it may be relevant to keep both versions (accelerated and original) if only to check the equality of the result... even if the non-accelerated version may be very long.

The accelerated version differs from the classic version only in the case of the global=F option. Indeed, in the global case, we do not calculate the CC.

It makes me think that to save a little time at execution (but it's really not a big deal) we could call adapt.cc() only in the conditional corresponding to global=F

samWieczorek commented 2 years ago

This issue must be linked to the connected components issue and the one of group-t-test

samWieczorek commented 2 years ago

Transferred to prostarproteomics github's organization