[FEATURE DaparToolshed] Compute connected components

samWieczorek commented 2 years ago

Describe the feature I found a way to speed up (almost x100) the calculation of the CCs needed for PEPA, so that it could also be used to visuate the pepe-prot relationship. here is the code.

the core function is get.pep.prot.cc()
in the long run it will be necessary to include it in PEPA, and to modify the code (with Laurent Jacob) so that the calculation of the CC is not done any more in PEPA, but recovers the result of get.pep.prot.cc() to work
there is also a display.CC function to make the visu (for later)
the rest of the code is to help you to get used to it...


get.pep.prot.cc <- function(X){
--
p <- dim(X)[2] # Nb proteins
q <- dim(X)[1] # Nb peptides
 
### Adjacency matrix construction
A <- as.matrix(t(X))%&%as.matrix(X) # boolean matrix product
diag(A) <- rep(0,p) # remove self-connecting edges
A <- matrix(as.numeric(A[,]), ncol=p) # goes back to classical matrix format
colnames(A) <- rownames(A) <- colnames(X) # reset pep and prot names
SingleProt.CC.id <- which(rowSums(A)==0) # proteins with no shared peptides
B <- A[-SingleProt.CC.id,-SingleProt.CC.id] # matrix with no 1-prot CC
 
### Protein CCs
singprot.cc <- as.list(names(SingleProt.CC.id))
g <- graphAM(B, edgemode='undirected', values=NA)
multprot.cc <- connComp(as(g, 'graphNEL'))
 
### Peptides from single prot CCs
singprot.cc.pep <- list()
for(i in 1:length(singprot.cc)){
peplist <- which(X[,singprot.cc[[i]]]!=0)
singprot.cc.pep[[i]] <- names(peplist)
}
 
### Peptides from multiple prot CCs
multprot.cc.pep <- list()
for(i in 1:length(multprot.cc)){
protlist <- multprot.cc[[i]]
subX <- X[,protlist]
peplist <- which(rowSums(subX)!=0)
multprot.cc.pep[[i]] <- names(peplist)
}
 
### Merge results into a single list
prot.cc <- c(multprot.cc, singprot.cc)
pep.cc <- c(multprot.cc.pep, singprot.cc.pep)
global.cc <- list()
for(i in 1:length(prot.cc)){
prot <- prot.cc[[i]]
pep <- pep.cc[[i]]
tmp <- list(prot,pep)
names(tmp) <- c("proteins", "peptides")
global.cc[[i]] <- tmp
}
 
### Clean memory and return result
rm(A,B,g,multprot.cc,singprot.cc,multprot.cc.pep,singprot.cc.pep,prot.cc,pep.cc)
gc()
return(global.cc)
}
 
 
### Toy example
###############
library(Matrix)
library(igraph)
list.of.cc <- get.pep.prot.cc(X)
 
# X must be a matrix construct as follow:
# X <- as.matrix(BuildAdjacencyMatrix(data, 'Protein.group.IDs', unique = FALSE))
# NB: possibly, remove as.matrix, as BuildAdjacencyMatrix returns a sparse matrix adapted to %&%
 
list.of.cc[[12]] # see the structure
length(list.of.cc) # number of CCs
cc.summary <- sapply(list.of.cc, function(x){c(length(x[[1]]),length(x[[2]]))})
rownames(cc.summary) <- c("Nb_proteins","Nb_peptides")
colSums(cc.summary) # total amount of pep and prot in each CC
colnames(cc.summary) <- 1:length(list.of.cc)
cc.summary
rowSums(cc.summary) # c(number of prot, number of pep)
 
plot(jitter(cc.summary[2,]),jitter(cc.summary[1,]), type="p", xlab="#peptides in CC", ylab="#proteins in CC")
 
# dprot <- max(cc.summary[1,])
# dpep <- max(cc.summary[2,])
# bisto <- matrix(rep(0,dprot*dpep), ncol=dpep)
# for(i in 1:ncol(cc.summary)){
#   bisto[cc.summary[1,i],cc.summary[2,i]] <- bisto[cc.summary[1,i],cc.summary[2,i]]+1
# }
# sum(bisto)
# rowSums(bisto)
# colSums(bisto)
# binsto <- bisto
# binsto[which(bisto==0)] <- -300
# image(t(binsto+300))
 
 
###############
 
display.CC <- function(The.CC,X,y){
col.prot <- "red"
col.spec <- "green"
col.shared <- "blue"
subX <- X[The.CC$peptides, The.CC$proteins]
nb.prot <- length(The.CC$proteins)
nb.pep <- length(The.CC$peptides)
edge.list <- which(subX==1, arr.ind=TRUE)
if(nb.prot==1){
if(nb.pep == 1){
#print("Single protein and single peptide graph - nothing to display")
net <- make_bipartite_graph( c(0,1), 1:2)
}
else{
edge.list <- cbind(rep(1,nb.pep), edge.list+1)
net <- make_bipartite_graph( c(rep(0,nb.prot), rep(1,nb.pep)), as.vector(t(edge.list)))
}
V(net)$label <- c(paste("PROT", The.CC$proteins),paste("pep", The.CC$peptides))
#V(net)$size <- c(rep(2,nb.prot), rep(1,nb.pep))
# display y (quantitative data) for peptides (proteins?)
V(net)$color <- c(rep(col.prot,nb.prot), rep(col.spec,nb.pep))
}
else{
edge.list[,1] <-edge.list[,1]+nb.prot
net <- make_bipartite_graph( c(rep(0,nb.prot), rep(1,nb.pep)), as.vector(t(edge.list)))
V(net)$label <- c(paste("PROT", The.CC$proteins),paste("pep", The.CC$peptides))
#V(net)$size <- c(rep(2,nb.prot), rep(1,nb.pep))
# display y (quantitative data) for peptides (proteins?)
V(net)$color <- c(rep(col.prot,nb.prot), rep(col.shared,nb.pep))
V(net)$color[(which(rowSums(subX)==1))+nb.prot] <- col.spec
}
V(net)$type <- c(rep("PROTEIN",nb.prot), rep("PEPTIDE",nb.pep))
hchart(net, layout = layout_nicely)
}
 
display.CC(list.of.cc[[12]],X,y)
display.CC(list.of.cc[[92]],X,y)
display.CC(list.of.cc[[2001]],X,y)
 
# hchart(net, layout = layout_with_fr)
# hchart(net, layout = layout_as_bipartite)
# hchart(net, layout = layout_with_dh)
# hchart(net, layout = layout_with_gem)
# hchart(net, layout = layout_with_graphopt )
# # bar-joseph layout ?

get.pep.prot.cc <- function(X){
  p <- dim(X)[[2](https://bioproj.extra.cea.fr/redmine/attachments/29901#L2)] # Nb proteins
  q <- dim(X)[1] # Nb peptides

  ### Adjacency matrix construction
  A <- as.matrix(t(X))%&%as.matrix(X) # boolean matrix product
  diag(A) <- rep(0,p) # remove self-connecting edges
  A <- matrix(as.numeric(A[,]), ncol=p) # goes back to classical matrix format
  colnames(A) <- rownames(A) <- colnames(X) # reset pep and prot names
  SingleProt.CC.id <- which(rowSums(A)==0) # proteins with no shared peptides
  B <- A[-SingleProt.CC.id,-SingleProt.CC.id] # matrix with no 1-prot CC

  ### Protein CCs
  singprot.cc <- as.list(names(SingleProt.CC.id))
  g <- graphAM(B, edgemode='undirected', values=NA)
  multprot.cc <- connComp(as(g, 'graphNEL'))

  ### Peptides from single prot CCs
  singprot.cc.pep <- list()
  for(i in 1:length(singprot.cc)){
    peplist <- which(X[,singprot.cc[[i]]]!=0)
    singprot.cc.pep[[i]] <- names(peplist)
  }

  ### Peptides from multiple prot CCs
  multprot.cc.pep <- list()
  for(i in 1:length(multprot.cc)){
    protlist <- multprot.cc[[i]]
    subX <- X[,protlist]
    peplist <- which(rowSums(subX)!=0)
    multprot.cc.pep[[i]] <- names(peplist)
  }

  ### Merge results into a single list
  prot.cc <- c(multprot.cc, singprot.cc)
  pep.cc <- c(multprot.cc.pep, singprot.cc.pep)
  global.cc <- list()
  for(i in 1:length(prot.cc)){
    prot <- prot.cc[[i]]
    pep <- pep.cc[[i]]
    tmp <- list(prot,pep)
    names(tmp) <- c("proteins", "peptides")
    global.cc[[i]] <- tmp
  }

  ### Clean memory and return result
  rm(A,B,g,multprot.cc,singprot.cc,multprot.cc.pep,singprot.cc.pep,prot.cc,pep.cc)
  gc()
  return(global.cc)
}
### Toy example
###############
library(Matrix)
library(igraph)
list.of.cc <- get.pep.prot.cc(X)
# X must be a matrix construct as follow:
# X <- as.matrix(BuildAdjacencyMatrix(data, 'Protein.group.IDs', unique = FALSE))
# NB: possibly, remove as.matrix, as BuildAdjacencyMatrix returns a sparse matrix adapted to %&%
list.of.cc[[12]] # see the structure
length(list.of.cc) # number of CCs
cc.summary <- sapply(list.of.cc, function(x){c(length(x[[1]]),length(x[[2]]))})
rownames(cc.summary) <- c("Nb_proteins","Nb_peptides")
colSums(cc.summary) # total amount of pep and prot in each CC
colnames(cc.summary) <- 1:length(list.of.cc)
cc.summary
rowSums(cc.summary) # c(number of prot, number of pep)
plot(jitter(cc.summary[2,]),jitter(cc.summary[1,]), type="p", xlab="#peptides in CC", ylab="#proteins in CC")
# dprot <- max(cc.summary[1,])
# dpep <- max(cc.summary[2,])
# bisto <- matrix(rep(0,dprot*dpep), ncol=dpep)
# for(i in 1:ncol(cc.summary)){
#   bisto[cc.summary[1,i],cc.summary[2,i]] <- bisto[cc.summary[1,i],cc.summary[2,i]]+1
# }
# sum(bisto)
# rowSums(bisto)
# colSums(bisto)
# binsto <- bisto
# binsto[which(bisto==0)] <- -[3](https://bioproj.extra.cea.fr/redmine/attachments/29901#L3)00
# image(t(binsto+300))
###############
display.CC <- function(The.CC,X,y){
  col.prot <- "red"
  col.spec <- "green"
  col.shared <- "blue"
  subX <- X[The.CC$peptides, The.CC$proteins]
  nb.prot <- length(The.CC$proteins)
  nb.pep <- length(The.CC$peptides)
  edge.list <- which(subX==1, arr.ind=TRUE)
  if(nb.prot==1){
    if(nb.pep == 1){
      #print("Single protein and single peptide graph - nothing to display")
      net <- make_bipartite_graph( c(0,1), 1:2)
    }
    else{
      edge.list <- cbind(rep(1,nb.pep), edge.list+1)
      net <- make_bipartite_graph( c(rep(0,nb.prot), rep(1,nb.pep)), as.vector(t(edge.list)))
    }
    V(net)$label <- c(paste("PROT", The.CC$proteins),paste("pep", The.CC$peptides))
    #V(net)$size <- c(rep(2,nb.prot), rep(1,nb.pep))
    # display y (quantitative data) for peptides (proteins?)
    V(net)$color <- c(rep(col.prot,nb.prot), rep(col.spec,nb.pep))
  }
  else{
    edge.list[,1] <-edge.list[,1]+nb.prot
    net <- make_bipartite_graph( c(rep(0,nb.prot), rep(1,nb.pep)), as.vector(t(edge.list)))
    V(net)$label <- c(paste("PROT", The.CC$proteins),paste("pep", The.CC$peptides))
    #V(net)$size <- c(rep(2,nb.prot), rep(1,nb.pep))
    # display y (quantitative data) for peptides (proteins?)
    V(net)$color <- c(rep(col.prot,nb.prot), rep(col.shared,nb.pep))
    V(net)$color[(which(rowSums(subX)==1))+nb.prot] <- col.spec
  }
  V(net)$type <- c(rep("PROTEIN",nb.prot), rep("PEPTIDE",nb.pep))
  hchart(net, layout = layout_nicely)
}
display.CC(list.of.cc[[12]],X,y)
display.CC(list.of.cc[[92]],X,y)
display.CC(list.of.cc[[2001]],X,y)
# hchart(net, layout = layout_with_fr)
# hchart(net, layout = layout_as_bipartite)
# hchart(net, layout = layout_with_dh)
# hchart(net, layout = layout_with_gem)
# hchart(net, layout = layout_with_graphopt )
# # bar-joseph layout ?

samWieczorek commented 2 years ago

1/ In PEPA the calculation of the related components depends only on the pep-prot matrix (named X), and thus this calculation can be done once for all comparisons (1vs1 or 1vsAll). It would thus be necessary to take this part of the pepa.test function and make it a separate function, and add its result in the parameters of the pepa.test function. To externalise this function, we just need to: (1) duplicate lines 421 and 422 and (2) extract the part from line 444 to 451. The result of the function is cc, directly called in the rest of the pepa.test code. 2/ make a function that first calculates cc, then calls the pepa.test function with the default parameters (and thus cc in addition), in order to calculate the p-values for all possible pairs of comparisons (in 1vs1 or 1vsAll). The list of p-values in output of pepa.test is in the llr.map.pv object. 3/ In parallel, make a similar function for the calculation of the cc, but with the fast cade (cf. beginning of the ticket). 4/ With the help of Laurent Jacob, we will replace the slow function by the fast one.

samWieczorek commented 2 years ago

Implemented but keep the issue open to make sure that when we integrate PEPA, it uses this accelerated function

edyp-lab / DaparToolshed

[FEATURE DaparToolshed] Compute connected components #2