randel / EnsDeconv

Ensemble Deconvolution to robustly estimate cellular fractions from bulk omics data
Other
5 stars 1 forks source link

Number of single cells may be limited? #1

Closed edammer closed 2 years ago

edammer commented 2 years ago

Hello @manqicai !

I am trying the EnsDeconv framework using a few single cell / single nucleus RNA-Seq datasets. I occassionally encounter the following error in my code (bold, pasted as comment where it occurs when first running the EnsDeconv( ) function). But I can bypass the error by reducing the number of columns (single cell profiles) in the ref_matrix object within the reflist list object, which I structured as the function expects.

## Load SN-RNA Data downloaded from UCSC Cell Browser: https://cells-test.gi.ucsc.edu/?ds=brain-vasc-atlas
require(Seurat)
require(data.table)
setwd("Vascular22/WyssCoray/")
mat = Read10X("./download/") #creates a sparse matrix.
meta<-read.table("./download/meta.tsv",header=T,sep="\t", as.is=TRUE, row.names=1)
genes=rownames(mat)
#so <- CreateSeuratObject(counts=mat,project="WyssCoray14cellTypes",meta.data=meta)

## Make reflist for ensDeconv
#meta$SubjectName<-rownames(meta)
meta$SamplesName<-rownames(meta)
meta$deconv_clust<-meta$Cell_Type

## Load Bulk Proteomic Data
load("../tempVascular4batchTMTratioOverGIS.TAMPORout.RData")
cleanRelAbun.Unreg<-TAMPORlist.1$cleanRelAbun
dim(cleanRelAbun.Unreg)
cleanRelAbun.Unreg<-cleanRelAbun.Unreg[,match(rownames(numericMeta),colnames(cleanRelAbun.Unreg))]
head(cleanRelAbun.Unreg)
symbols<-as.data.frame(do.call("rbind",strsplit(rownames(cleanRelAbun.Unreg),"[|]")))[,1]
cleanRelAbun.Unreg.UniqueIDs<-rownames(cleanRelAbun.Unreg)
## Collapse to unique gene product proteins
library(WGCNA)
cleanRelAbun.Unreg.CollapsedRows<-collapseRows(cleanRelAbun.Unreg,rowGroup=symbols,rowID=cleanRelAbun.Unreg.UniqueIDs,method="maxRowVariance")
cleanRelAbun.Unreg.Collapsed<-cleanRelAbun.Unreg.CollapsedRows$datETcollapsed
symbols.collapsed<-as.data.frame(do.call("rbind",strsplit(rownames(cleanRelAbun.Unreg.Collapsed),"[|]")))[,1]
rownames(cleanRelAbun.Unreg.Collapsed)<-symbols.collapsed   #cleanRelAbun.Unreg.Collapsed.ENSGIDs
cleanRelAbun.Unreg.Collapsed<-cleanRelAbun.Unreg.Collapsed[which(!rownames(cleanRelAbun.Unreg.Collapsed)=="0"),]

## Try Deconvolution, per https://randel.github.io/EnsDeconv/articles/EnsDeconv.html
library(foreach)
library(tidyverse)
library(EnsDeconv)
library(sparseMatrixStats)
library(scran)
#data(testdata)

#tried to omit NAs from count_bulk, but error for this reference list about NA/NaN/Inf is stubborm, in the EnsDeconv() function call below.
reflist<-list(count_bulk=as.matrix(cleanRelAbun.Unreg.Collapsed), ref_list=list(WyssCoray14=list(meta_ref=meta,ref_matrix = mat)  ))

#Run EnsDeconv - quick test run
params = get_params(data_type = "singlecell-rna", data_name = names(reflist$ref_list), n_markers = 50, Marker.Method = "t", TNormalization = "none", CNormalization = "none", dmeths = "CIBERSORT")
cleanRelAbun.Unreg.Resolved.WyssCoray22.CIBERSORT = EnsDeconv(count_bulk = as.matrix(cleanRelAbun.Unreg.Collapsed), ref_list = reflist$ref_list, params=params, parallel_comp=FALSE)

Error in h(simpleError(msg, call)) : error in evaluating the argument 'x' in selecting a method for function 't': NA/NaN/Inf in foreign function call (arg 2)


#pheatmap::pheatmap(cleanRelAbun.Unreg.Resolved.WyssCoray22.CIBERSORT[["EnsDeconv"]][["ensemble_p"]],cluster_rows = T, cluster_cols = F)
#cleanRelAbun.Unreg.WyssCoray22.CIBERSORT.cellTypeProportions<-cleanRelAbun.Unreg.Resolved.WyssCoray22.CIBERSORT[["EnsDeconv"]][["ensemble_p"]]

table(meta$Region)
#     Cortex Hippocampus 
#      48607       95186

#reduce cells (columns of ref_matrix) used to those only with meta$Region=="Cortex"
meta.ctx<-meta[meta$Region=="Cortex",]
dim(reflist$ref_list$WyssCoray14$ref_matrix)
#[1]  23537 143793  -- original sparse matrix dimensions
reflist$ref_list$WyssCoray14$meta_ref<-meta.ctx
#ENFORCE order of meta_ref rows with match()
reflist$ref_list$WyssCoray14$ref_matrix<-reflist$ref_list$WyssCoray14$ref_matrix[,match(meta.ctx$SamplesName,colnames(reflist$ref_list$WyssCoray14$ref_matrix))]
#recheck order
which(!match(reflist$ref_list$WyssCoray14$meta_ref$SamplesName,colnames(reflist$ref_list$WyssCoray14$ref_matrix))==c(1:ncol(reflist$ref_list$WyssCoray14$ref_matrix)))
#integer(0) #IN SAME ORDER. 
dim(reflist$ref_list$WyssCoray14$ref_matrix)
#[1] 23537 48607

cleanRelAbun.Unreg.Resolved.WyssCoray22.CIBERSORT = EnsDeconv(count_bulk = as.matrix(cleanRelAbun.Unreg.Collapsed), ref_list = reflist$ref_list, params=params, parallel_comp=FALSE)

#completes, no error.

pheatmap::pheatmap(cleanRelAbun.Unreg.Resolved.WyssCoray22.CIBERSORT[["EnsDeconv"]][["ensemble_p"]],cluster_rows = T, cluster_cols = F)
cleanRelAbun.Unreg.WyssCoray22.CIBERSORT.cellTypeProportions<-cleanRelAbun.Unreg.Resolved.WyssCoray22.CIBERSORT[["EnsDeconv"]][["ensemble_p"]]

Is there anything I can do to run with the full ref_matrix of 143,793 single nuclei profiles instead of the 48,607? The matrix holding the single nuclei profiles with gene symbols as rownames is maintained throughout as a sparse matrix.

Regards,

Eric

manqicai commented 2 years ago

Hi Eric,

Thank you for your interests in EnsDeconv. I will take a look at this and get back to you asap.

Best, Manqi

manqicai commented 2 years ago

@edammer Hi Eric,

Since I don't have the bulk data you used. I made up a bulk data for testing. testbulk <- as.matrix(mat[,1:20]) testbulk <- matrix(rnorm(470740,20),nrow = 23537) rownames(testbulk) = genes colnames(testbulk) = paste0("Sample",1:20)

reflist<-list(count_bulk=as.matrix(testbulk), ref_list=list(WyssCoray14=list(meta_ref=meta,ref_matrix = mat) ))

Run EnsDeconv - quick test run

params = get_params(data_type = "singlecell-rna", data_name = names(reflist$ref_list), n_markers = 50, Marker.Method = "t", TNormalization = "none", CNormalization = "none", dmeths = "CIBERSORT") cleanRelAbun.Unreg.Resolved.WyssCoray22.CIBERSORT = EnsDeconv(count_bulk = as.matrix(testbulk), ref_list = reflist$ref_list, params=params, parallel_comp=T,ncore = 2)

It's OK for me to successfully run this on my end. I don't have the same error message as your end. Could you please try using this testing data and let me know if this works? This won't take long. BTW just to check, may I know what's your computer configuration?

Thanks, Manqi

edammer commented 2 years ago

testbulk <- as.matrix(mat[,1:20]) testbulk <- matrix(rnorm(470740,20),nrow = 23537) rownames(testbulk) = genes colnames(testbulk) = paste0("Sample",1:20)

reflist<-list(count_bulk=as.matrix(testbulk), ref_list=list(WyssCoray14=list(meta_ref=meta,ref_matrix = mat) ))

Run EnsDeconv - quick test run params = get_params(data_type = "singlecell-rna", data_name = names(reflist$ref_list), n_markers = 50, Marker.Method = "t", TNormalization = "none", CNormalization = "none", dmeths = "CIBERSORT") cleanRelAbun.Unreg.Resolved.WyssCoray22.CIBERSORT = EnsDeconv(count_bulk = as.matrix(testbulk), ref_list = reflist$ref_list, params=params, parallel_comp=T,ncore = 2)

The above does indeed run on my system as well. My system is CentOS 7 with R Studio Server 2022.02.3 build 492 accessed through web interface, with R v4.2.0 on the back end. CPU threads available on the VM number 38, with 366 GB physical RAM mapped to the VM, plus 23 additional GB swap space available for overrun. When parallel_comp=TRUE, the memory overrun freezes the VM at about 20-25% completion over a few hours, and all threads appear at 100% use in htop from a console regardless of how many threads are specified with the ncore= parameter.

However, a single thread run (parallel_comp=F) did complete after 4 days, run on the reflist with the 48607 cortex single nuclei and metadata for 23537 genes and my proteomic data on 72 samples with 9853 protein isoforms collapsed to 9819 gene product proteins with max variance across samples. The script lines were:

params = get_params(data_type = "singlecell-rna", data_name = names(reflist$ref_list), n_markers = 50, Marker.Method = c("t", "wilcox", "combined", "none", "p.value", "regression"), TNormalization = "none", CNormalization = "none", dmeths = NULL)
cleanRelAbun.Unreg.Resolved.WyssCoray22.Ensemble = EnsDeconv(count_bulk = as.matrix(cleanRelAbun.Unreg.Collapsed), ref_list = reflist$ref_list, params=params, parallel_comp=FALSE)

Over the 4 days, the following errors appeared on the console, but they did not prevent the function from completing:

Timing stopped at: 1.936 0.285 2.218

Failed dtangle. Caught Error in intI(j, n = d[2], dn[[2]], give.dn = FALSE): invalid character indexing Failed dtangle. Caught Error in to_deconv[unlist(markers), ]: subscript out of bounds Failed hspe. Caught Error in to_deconv[unlist(markers), ]: subscript out of bounds Failed CIBERSORT. Caught Error in intI(i, n = x@Dim[1], dn[[1]], give.dn = FALSE): 'NA' indices are not (yet?) supported for sparse Matrices Failed CIBERSORT. Caught Error in intI(i, n = x@Dim[1], dn[[1]], give.dn = FALSE): 'NA' indices are not (yet?) supported for sparse Matrices Failed EPIC. Caught Error in intI(i, n = x@Dim[1], dn[[1]], give.dn = FALSE): 'NA' indices are not (yet?) supported for sparse Matrices Failed EPIC. Caught Error in intI(i, n = x@Dim[1], dn[[1]], give.dn = FALSE): 'NA' indices are not (yet?) supported for sparse Matrices Timing stopped at: 0.092 0 0.092 Failed MuSiC. Caught Error in rownames<-(*tmp*, value = rownames(count_sc)): attempt to set 'rownames' on an object with no dimensions Timing stopped at: 0.111 0 0.111 Failed MuSiC. Caught Error in rownames<-(*tmp*, value = rownames(count_sc)): attempt to set 'rownames' on an object with no dimensions Timing stopped at: 0.13 0 0.13 Failed MuSiC. Caught Error in rownames<-(*tmp*, value = rownames(count_sc)): attempt to set 'rownames' on an object with no dimensions Timing stopped at: 0.57 0 0.569 Failed MuSiC. Caught Error in rownames<-(*tmp*, value = rownames(count_sc)): attempt to set 'rownames' on an object with no dimensions Failed MuSiC. Caught Error in intI(i, n = x@Dim[1], dn[[1]], give.dn = FALSE): 'NA' indices are not (yet?) supported for sparse Matrices Timing stopped at: 0.113 0 0.112 Failed MuSiC. Caught Error in rownames<-(*tmp*, value = rownames(count_sc)): attempt to set 'rownames' on an object with no dimensions Timing stopped at: 0 0 0 Failed BisqueRNA. Caught Error in system.time(output <- switch(method_name, dtangle_GEP = dtangle::dtangle(Y = t(cbind(sig_matrix, : Method not found. Timing stopped at: 0 0 0 Failed BisqueRNA. Caught Error in system.time(output <- switch(method_name, dtangle_GEP = dtangle::dtangle(Y = t(cbind(sig_matrix, : Method not found. Timing stopped at: 0 0 0 Failed BisqueRNA. Caught Error in system.time(output <- switch(method_name, dtangle_GEP = dtangle::dtangle(Y = t(cbind(sig_matrix, : Method not found. Timing stopped at: 0 0 0 Failed BisqueRNA. Caught Error in system.time(output <- switch(method_name, dtangle_GEP = dtangle::dtangle(Y = t(cbind(sig_matrix, : Method not found. Failed BisqueRNA. Caught Error in intI(i, n = x@Dim[1], dn[[1]], give.dn = FALSE): 'NA' indices are not (yet?) supported for sparse Matrices Timing stopped at: 0 0 0 Failed BisqueRNA. Caught Error in system.time(output <- switch(method_name, dtangle_GEP = dtangle::dtangle(Y = t(cbind(sig_matrix, : Method not found. Timing stopped at: 0.004 0 0.004 Failed GEDIT. Caught Error in file(file, ifelse(append, "a", "w")): cannot open the connection Timing stopped at: 0.001 0 0.001 Failed GEDIT. Caught Error in file(file, ifelse(append, "a", "w")): cannot open the connection Timing stopped at: 0.002 0 0.002 Failed GEDIT. Caught Error in file(file, ifelse(append, "a", "w")): cannot open the connection Timing stopped at: 0.001 0 0.002 Failed GEDIT. Caught Error in file(file, ifelse(append, "a", "w")): cannot open the connection Failed GEDIT. Caught Error in intI(i, n = x@Dim[1], dn[[1]], give.dn = FALSE): 'NA' indices are not (yet?) supported for sparse Matrices Timing stopped at: 0.001 0 0.001 Failed GEDIT. Caught Error in file(file, ifelse(append, "a", "w")): cannot open the connection Timing stopped at: 0.014 0 0.015 Failed GEDIT. Caught Error in file(file, ifelse(append, "a", "w")): cannot open the connection Timing stopped at: 0.014 0 0.014 Failed GEDIT. Caught Error in file(file, ifelse(append, "a", "w")): cannot open the connection Timing stopped at: 0.013 0 0.013 Failed GEDIT. Caught Error in file(file, ifelse(append, "a", "w")): cannot open the connection Timing stopped at: 0.002 0 0.001 Failed GEDIT. Caught Error in file(file, ifelse(append, "a", "w")): cannot open the connection Failed GEDIT. Caught Error in intI(i, n = x@Dim[1], dn[[1]], give.dn = FALSE): 'NA' indices are not (yet?) supported for sparse Matrices Timing stopped at: 0.001 0 0.001 Failed GEDIT. Caught Error in file(file, ifelse(append, "a", "w")): cannot open the connection Adding 1e-5 to Z to ensure valid log transformation. Iter 1: max diff of rho: 0.74454527493055 Adding 1e-5 to Z to ensure valid log transformation. Iter 1: max diff of rho: 0.799730809057635 Adding 1e-5 to Z to ensure valid log transformation. Iter 1: max diff of rho: 0.808360133309158 Adding 1e-5 to Z to ensure valid log transformation. Iter 1: max diff of rho: 0.572251230756159 Failed ICeDT. Caught Error in intI(i, n = x@Dim[1], dn[[1]], give.dn = FALSE): 'NA' indices are not (yet?) supported for sparse Matrices Adding 1e-5 to Z to ensure valid log transformation. Iter 1: max diff of rho: 0.601934451268532 Timing stopped at: 0.039 0 0.143 Failed DeconRNASeq. Caught Error in prep(x.data, scale = "none", center = TRUE): could not find function "prep" Timing stopped at: 0.017 0 0.017 Failed DeconRNASeq. Caught Error in prep(x.data, scale = "none", center = TRUE): could not find function "prep" Timing stopped at: 0.017 0 0.018 Failed DeconRNASeq. Caught Error in prep(x.data, scale = "none", center = TRUE): could not find function "prep" Timing stopped at: 0.023 0 0.023 Failed DeconRNASeq. Caught Error in prep(x.data, scale = "none", center = TRUE): could not find function "prep" Failed DeconRNASeq. Caught Error in intI(i, n = x@Dim[1], dn[[1]], give.dn = FALSE): 'NA' indices are not (yet?) supported for sparse Matrices Timing stopped at: 0.004 0 0.004 Failed DeconRNASeq. Caught Error in prep(x.data, scale = "none", center = TRUE): could not find function "prep" Timing stopped at: 0.005 0 0.005 Failed DeconRNASeq. Caught Error in prep(x.data, scale = "none", center = TRUE): could not find function "prep" Timing stopped at: 0.005 0 0.004 Failed DeconRNASeq. Caught Error in prep(x.data, scale = "none", center = TRUE): could not find function "prep" Timing stopped at: 0.017 0 0.017 Failed DeconRNASeq. Caught Error in prep(x.data, scale = "none", center = TRUE): could not find function "prep" Timing stopped at: 0.022 0 0.022 Failed DeconRNASeq. Caught Error in prep(x.data, scale = "none", center = TRUE): could not find function "prep" Failed DeconRNASeq. Caught Error in intI(i, n = x@Dim[1], dn[[1]], give.dn = FALSE): 'NA' indices are not (yet?) supported for sparse Matrices Timing stopped at: 0.003 0.001 0.005 Failed DeconRNASeq. Caught Error in prep(x.data, scale = "none", center = TRUE): could not find function "prep" Failed FARDEEP. Caught Error in intI(i, n = x@Dim[1], dn[[1]], give.dn = FALSE): 'NA' indices are not (yet?) supported for sparse Matrices Failed FARDEEP. Caught Error in intI(i, n = x@Dim[1], dn[[1]], give.dn = FALSE): 'NA' indices are not (yet?) supported for sparse Matrices [1] "selecting only specific markers that we want to use for our analysis" [1] "running deconvolution on rownames(sig_matrix)" [1] "selecting only specific markers that we want to use for our analysis" [1] "running deconvolution on rownames(sig_matrix)" [1] "selecting only specific markers that we want to use for our analysis" [1] "running deconvolution on rownames(sig_matrix)" [1] "selecting only specific markers that we want to use for our analysis" [1] "running deconvolution on rownames(sig_matrix)" Failed DCQ. Caught Error in intI(i, n = x@Dim[1], dn[[1]], give.dn = FALSE): 'NA' indices are not (yet?) supported for sparse Matrices [1] "selecting only specific markers that we want to use for our analysis" [1] "running deconvolution on rownames(sig_matrix)" [1] "selecting only specific markers that we want to use for our analysis" [1] "running deconvolution on rownames(sig_matrix)" [1] "selecting only specific markers that we want to use for our analysis" [1] "running deconvolution on rownames(sig_matrix)" [1] "selecting only specific markers that we want to use for our analysis" [1] "running deconvolution on rownames(sig_matrix)" [1] "selecting only specific markers that we want to use for our analysis" [1] "running deconvolution on rownames(sig_matrix)" Failed DCQ. Caught Error in intI(i, n = x@Dim[1], dn[[1]], give.dn = FALSE): 'NA' indices are not (yet?) supported for sparse Matrices [1] "selecting only specific markers that we want to use for our analysis" [1] "running deconvolution on rownames(sig_matrix)" There were 48 warnings (use warnings() to see them)

There were 54 list elements in cleanRelAbun.Unreg.Resolved.WyssCoray22.Ensemble$allgene_res

length(ls(cleanRelAbun.Unreg.Resolved.WyssCoray22.Ensemble$allgene_res))
[1] 54
ls(cleanRelAbun.Unreg.Resolved.WyssCoray22.Ensemble$allgene_res)
 [1] "none_none_singlecell-rna_WyssCoray14_50_combined_linear_TRUE_CIBERSORT"  
 [2] "none_none_singlecell-rna_WyssCoray14_50_combined_linear_TRUE_DCQ"        
 [3] "none_none_singlecell-rna_WyssCoray14_50_combined_linear_TRUE_EPIC"       
 [4] "none_none_singlecell-rna_WyssCoray14_50_combined_linear_TRUE_FARDEEP"    
 [5] "none_none_singlecell-rna_WyssCoray14_50_combined_linear_TRUE_ICeDT"      
 [6] "none_none_singlecell-rna_WyssCoray14_50_combined_log_TRUE_CIBERSORT"     
 [7] "none_none_singlecell-rna_WyssCoray14_50_combined_log_TRUE_DCQ"           
 [8] "none_none_singlecell-rna_WyssCoray14_50_combined_log_TRUE_dtangle"       
 [9] "none_none_singlecell-rna_WyssCoray14_50_combined_log_TRUE_EPIC"          
[10] "none_none_singlecell-rna_WyssCoray14_50_combined_log_TRUE_FARDEEP"       
[11] "none_none_singlecell-rna_WyssCoray14_50_combined_log_TRUE_hspe"          
[12] "none_none_singlecell-rna_WyssCoray14_50_none_linear_TRUE_CIBERSORT"      
[13] "none_none_singlecell-rna_WyssCoray14_50_none_linear_TRUE_DCQ"            
[14] "none_none_singlecell-rna_WyssCoray14_50_none_linear_TRUE_EPIC"           
[15] "none_none_singlecell-rna_WyssCoray14_50_none_linear_TRUE_FARDEEP"        
[16] "none_none_singlecell-rna_WyssCoray14_50_none_linear_TRUE_ICeDT"          
[17] "none_none_singlecell-rna_WyssCoray14_50_none_log_TRUE_CIBERSORT"         
[18] "none_none_singlecell-rna_WyssCoray14_50_none_log_TRUE_DCQ"               
[19] "none_none_singlecell-rna_WyssCoray14_50_none_log_TRUE_EPIC"              
[20] "none_none_singlecell-rna_WyssCoray14_50_none_log_TRUE_FARDEEP"           
[21] "none_none_singlecell-rna_WyssCoray14_50_none_log_TRUE_hspe"              
[22] "none_none_singlecell-rna_WyssCoray14_50_regression_linear_TRUE_CIBERSORT"
[23] "none_none_singlecell-rna_WyssCoray14_50_regression_linear_TRUE_DCQ"      
[24] "none_none_singlecell-rna_WyssCoray14_50_regression_linear_TRUE_EPIC"     
[25] "none_none_singlecell-rna_WyssCoray14_50_regression_linear_TRUE_FARDEEP"  
[26] "none_none_singlecell-rna_WyssCoray14_50_regression_linear_TRUE_ICeDT"    
[27] "none_none_singlecell-rna_WyssCoray14_50_regression_log_TRUE_CIBERSORT"   
[28] "none_none_singlecell-rna_WyssCoray14_50_regression_log_TRUE_DCQ"         
[29] "none_none_singlecell-rna_WyssCoray14_50_regression_log_TRUE_dtangle"     
[30] "none_none_singlecell-rna_WyssCoray14_50_regression_log_TRUE_EPIC"        
[31] "none_none_singlecell-rna_WyssCoray14_50_regression_log_TRUE_FARDEEP"     
[32] "none_none_singlecell-rna_WyssCoray14_50_regression_log_TRUE_hspe"        
[33] "none_none_singlecell-rna_WyssCoray14_50_t_linear_TRUE_CIBERSORT"         
[34] "none_none_singlecell-rna_WyssCoray14_50_t_linear_TRUE_DCQ"               
[35] "none_none_singlecell-rna_WyssCoray14_50_t_linear_TRUE_EPIC"              
[36] "none_none_singlecell-rna_WyssCoray14_50_t_linear_TRUE_FARDEEP"           
[37] "none_none_singlecell-rna_WyssCoray14_50_t_linear_TRUE_ICeDT"             
[38] "none_none_singlecell-rna_WyssCoray14_50_t_log_TRUE_CIBERSORT"            
[39] "none_none_singlecell-rna_WyssCoray14_50_t_log_TRUE_DCQ"                  
[40] "none_none_singlecell-rna_WyssCoray14_50_t_log_TRUE_dtangle"              
[41] "none_none_singlecell-rna_WyssCoray14_50_t_log_TRUE_EPIC"                 
[42] "none_none_singlecell-rna_WyssCoray14_50_t_log_TRUE_FARDEEP"              
[43] "none_none_singlecell-rna_WyssCoray14_50_t_log_TRUE_hspe"                 
[44] "none_none_singlecell-rna_WyssCoray14_50_wilcox_linear_TRUE_CIBERSORT"    
[45] "none_none_singlecell-rna_WyssCoray14_50_wilcox_linear_TRUE_DCQ"          
[46] "none_none_singlecell-rna_WyssCoray14_50_wilcox_linear_TRUE_EPIC"         
[47] "none_none_singlecell-rna_WyssCoray14_50_wilcox_linear_TRUE_FARDEEP"      
[48] "none_none_singlecell-rna_WyssCoray14_50_wilcox_linear_TRUE_ICeDT"        
[49] "none_none_singlecell-rna_WyssCoray14_50_wilcox_log_TRUE_CIBERSORT"       
[50] "none_none_singlecell-rna_WyssCoray14_50_wilcox_log_TRUE_DCQ"             
[51] "none_none_singlecell-rna_WyssCoray14_50_wilcox_log_TRUE_dtangle"         
[52] "none_none_singlecell-rna_WyssCoray14_50_wilcox_log_TRUE_EPIC"            
[53] "none_none_singlecell-rna_WyssCoray14_50_wilcox_log_TRUE_FARDEEP"         
[54] "none_none_singlecell-rna_WyssCoray14_50_wilcox_log_TRUE_hspe" 

Therefore, based on the errors caught and the successful combinations, certain combinations of algorithm and statistic did NOT successfully run and contribute to the ensemble results, as follows:

I believe the failures are probably due to missing values in the bulk proteomic data. There were 4 batches of TMT proteomic data with 18 samples in each batch. Batchwise missingness in rows (genes) is controlled to 1 less than 50% or fewer (i.e. 35 or less of the 72 samples), and missing values are specified by a formal NA in the matrix of proteomic counts.

I think this post gets to the heart of the issue -- missing value handling. Any thoughts on how best to move forward on a couple of fronts? First, there is the possibility of using the ensemble of method combinations that does run. Now that we know what runs, should I just specify the algorithms and statistics that do not fail? Will this possibly run in less time and with less memory with ncores>1, since none of the failed iterations will be run? Second, would there be a recommendation for handling missing values (e.g. with na.omit, or impute::impute.knn()?) In this case, dim(na.omit(reflist$count_bulk)) keeps 8383 genes with no missing values across the 72 samples.

I am glad to share the bulk protein count matrix for you to test strategies for handling such data better. The EnsDeconv package is going to be very valuable for accurate cell type proportion estimates in brain tissue, and I would very much like to continue using it in our bulk brain proteomics.

manqicai commented 2 years ago

@edammer Hi Eric,

It would be great if you could share the bulk data you used for me investigate such situation further. Thank you for valuable info and interests in our algorithm. I need to investigate more about our method using in the Linux system and get back to you with more info.

edammer commented 2 years ago

Banner_frontalCortex_bulkProteomicCounts.zip The attached .ZIP is of a .RData file containing 2 variables: BannerTraits, BannerBulkProteinCounts

@manqicai Manqi, Here is a published bulk frontal cortex proteomic count matrix for Banner TMT (MS2)-quantified protein counts assembled from identified and quantified peptides in a typical bottom-up proteomics discovery mode experiment. It is 201 case samples, plus one or more all-sample physical average global internal standard (GIS) mixture samples in each batch. The GIS sample proportions will likely all come out very similar, clustering together as a sanity check. The data have been batch corrected and controlled for missing values to less than 50% of each row. The publication with this data is Johnson, et al, Nat Neurosci, 2022.

manqicai commented 2 years ago

@edammer Great, Thanks! I will take a look at this and get back to you soon.

edammer commented 2 years ago

Thank you. I have a new observation -- the quick CIBERSORT log and linear 2 combination ensemble that runs so quickly, only works for me with n_markers=50 in the params. All other settings tried cause the original error to reappear. This is running EnsDeconv installed on a Windows 10 machine (R v4.0.2, 64 GB RAM, 32 threads available), on the same 72 sample bulk data with ~9000 gene product proteins and the Wyss-Coray 14 single nuclei types (N=48607 nuclei from cortex only).

Screenshot: image

Edit: The settings tried were all for 25 or fewer markers. 30 markers minimum does still complete.

edammer commented 2 years ago

A technical note about the Wyss-Coray single nuclei data. They have some cell types that are not expected in cleanly dissected frontal cortex of brain. image

There are only 54 cortex ependymal single nuclei in Wyss-Coray's data. With marker selection type "t" as the only method used, this cell type takes over GFAP (which is ependymal/ventricular lining expressed) as a marker, and throws off cell type proportions gravely in another bulk cortex tissue proteomic count data set of 384 samples:

image

We know we carefully dissect tissue to avoid unnecessary heterogeniety contributing to our datasets for both differential and co-expression analyses. Therefore, the minor, sub 1%, of cell nuclei that end up looking like a relatively large proportion in bulk test runs with CIBERSORT should not be considered in estimation of other cell type proportions in a final run. I took out meningeal fibroblasts and ependyma on this basis, and now get the following heat map, which is much more reasonable:

image

manqicai commented 2 years ago

Hi Eric, @edammer

I want to thank you again for your patience! I am so sorry for the delayed response on this. For the missingness of the bulk data BannerBulkProteinCounts. I have the following suggestions for you. I checked the missingness pattern of this data, it turns out that the missingness patterns are consistent with batches. For example, if you only look at batch 1, which you can identify by BannerTraits$Batch == b01, there are 350 proteins containing only NAs. Similar patterns can be observed in batches like b02, b04, b09, b10, etc. For other batches, most missing proteins still contain only NAs, for only 1-3 proteins contain only one NA across samples. Since deconvolution is done sample by sample. I suggested the following way to handle your problem:

  1. Separate the original data into 22 sub-data by batches
  2. Remove proteins containing only NA values
  3. Perform mean imputation or other imputation, you like to make the bulk data a complete data.

I wrote some sample code below to get the final data. Let me know if anything does not make sense to you.

# subset data contain NAs
data_check <- BannerBulkProteinCounts[which(is.na(rowSums(BannerBulkProteinCounts))),]

# check bactch data
batches <- unique(BannerTraits$Batch)
for (i in batches) {
  data_sub <-  data_check[,which(BannerTraits$Batch == i)]
  print(table(apply(data_sub, 1, function(x) sum(is.na(x)))))
}

table(BannerTraits$Batch)

BannerBulkProteinCounts_complete <- lapply(batches, function(i){
  data_sub <-  BannerBulkProteinCounts[,which(BannerTraits$Batch == i)]
  # Find protein whose values are all NAs, and remove them
  tmp <- apply(data_sub, 1, function(x) sum(is.na(x)))
  data_complete <- data_sub[which(tmp!=11),]
  return(data_complete)
})

#Check missingness
lapply(BannerBulkProteinCounts_complete, function(x){
  which(is.na(rowSums(x)))
})

# Use Mean imputation below to impute those missing values.
#install.packages("missMethods")
library(missMethods)

BannerBulkProteinCounts_complete <-  lapply(BannerBulkProteinCounts_complete, function(x){
  impute_mean(x, type = "rowwise")
})
#Check missingness
lapply(BannerBulkProteinCounts_complete, function(x){
  which(is.na(rowSums(x)))
})

# Deconvolve each element in BannerBulkProteinCounts_complete separately.
# Example code, need to modify if needed
# final_result <- lapply(BannerBulkProteinCounts_complete ,function(bulk){
#  res <- EnsDeconv(count_bulk = as.matrix(bulk), ref_list = reflist$ref_list, params=params, parallel_comp=FALSE)
#  return(res)
#} )