Closed MingLi-929 closed 1 year ago
1. 使用tidyr::pivot_longer()直接处理矩阵
2. 使用Matrix将矩阵转为dgRMatrix的稀疏矩阵---->提前生成全为0的矩阵----->根据稀疏矩阵得到行列index替换全为0的长矩阵(这个长矩阵与直接pivot_longer()转化的长矩阵相同)
3. 使用Matrix,将矩阵转为dgRMatrix的稀疏矩阵---->根据行列index直接生成长矩阵(这个长矩阵会比直接pivot_longer()转化的长矩阵减少,减少数值为0的)
使用microbenchmark对三种方式进行测试
library("microbenchmark")
library(Matrix)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
files <- getSampleFiles()
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
mt <- getTagMatrix(peak = files[[5]],TxDb = txdb,
upstream = 1000,downstream = 1000,
type = "start_site",by = "gene",
weightCol = "V5")
mt <- t(apply(mt, 1, function(x) x/max(x)))
ii <- order(rowSums(mt))
mt <- mt[ii,]
colnames(mt) <- seq_len(dim(mt)[2])
rownames(mt) <- seq_len(dim(mt)[1])
## 第一种直接转化
mt_1<-mt
mt_1 <- mt_1 %>% as.data.frame() %>%
rownames_to_column("sample_ID") %>%
pivot_longer(-c(sample_ID),names_to = "coordinate",
values_to = "values")
mt_1$coordinate <- as.numeric(mt_1$coordinate)
mt_1$sample_ID <- as.numeric(mt_1$sample_ID)
## 第二种生成全0矩阵后进行替换
mt_2 <- as(mt, 'dgRMatrix')
mt_2_df <- data.frame(sample_ID=rep(seq_len(mt_2@Dim[1]),rep(mt_2@Dim[2],mt_2@Dim[1])),
coordinate=rep(seq_len(mt_2@Dim[2]),mt_2@Dim[1]),
values.1=rep(0,mt_2@Dim[1]*mt_2@Dim[2]))
col_pos <- mt_2@j+1
row_pos <- findInterval(seq(mt_2@x)-1,mt_2@p[-1])+1
df <- data.frame(sample_ID=row_pos,coordinate=col_pos,values=mt_sp@x)
merge_df <- merge(mt_2_df,df,all=TRUE,)[,c(1,2,4)]
merge_df[is.na(merge_df)] <- 0
## 第三种由稀疏矩阵直接生成不带有0的长矩阵
mt_3 <- as(mt, 'dgRMatrix')
col_pos <- mt_3@j+1
row_pos <- findInterval(seq(mt_3@x)-1,mt_3@p[-1])+1
mt_3_sp <- data.frame(sample_ID=row_pos,coordinate=col_pos,values=mt_3@x)
进行测试
# 对运行速度进行测试
func1 <- function(){
mt_1<-mt
mt_1 <- mt_1 %>% as.data.frame() %>%
rownames_to_column("sample_ID") %>%
pivot_longer(-c(sample_ID),names_to = "coordinate",
values_to = "values")
mt_1$coordinate <- as.numeric(mt_1$coordinate)
mt_1$sample_ID <- as.numeric(mt_1$sample_ID)
}
func2 <- function(){
mt_2 <- as(mt, 'dgRMatrix')
mt_2_df <- data.frame(sample_ID=rep(seq_len(mt_2@Dim[1]),rep(mt_2@Dim[2],mt_2@Dim[1])),
coordinate=rep(seq_len(mt_2@Dim[2]),mt_2@Dim[1]),
values.1=rep(0,mt_2@Dim[1]*mt_2@Dim[2]))
col_pos <- mt_2@j+1
row_pos <- findInterval(seq(mt_2@x)-1,mt_2@p[-1])+1
df <- data.frame(sample_ID=row_pos,coordinate=col_pos,values=mt_sp@x)
merge_df <- merge(mt_2_df,df,all=TRUE,)[,c(1,2,4)]
merge_df[is.na(merge_df)] <- 0
}
func3 <- function(){
mt_3 <- as(mt, 'dgRMatrix')
col_pos <- mt_3@j+1
row_pos <- findInterval(seq(mt_3@x)-1,mt_3@p[-1])+1
mt_3_sp <- data.frame(sample_ID=row_pos,coordinate=col_pos,values=mt_3@x)
}
bench_res <- microbenchmark("use tidyr"=func1(),
"use sparse matrix build long matrix with 0"=func2(),
"use sparse matrix build long matrix"=func3())
得到的测试结果
> bench_res
Unit: milliseconds
expr min lq mean
use tidyr 448.3222 462.9606 496.77029
use sparse matrix build long matrix with 0 4312.1044 4455.9427 4640.16852
use sparse matrix build long matrix 25.3467 29.2169 39.21166
median uq max neval
467.52880 541.4086 720.7547 100
4539.81870 4695.4377 5655.6305 100
30.37515 32.6257 171.1232 100
得到如下结论
根据上面可以知道
bench_res_tidyr <- microbenchmark(matrix=func1(),
plot=func4(mt_1))
bench_res_sparse <- microbenchmark(matrix=func3(),
plot=func4(mt_3_sp))
测试结果如下所示
> bench_res_tidyr
Unit: milliseconds
expr min lq mean median uq max neval
matrix 466.1613 474.6662 507.455975 485.5752 524.41290 1134.3673 100
plot 2.3244 2.4761 2.907646 2.8150 2.98015 8.0449 100
> bench_res_sparse
Unit: milliseconds
expr min lq mean median uq max neval
matrix 26.3806 27.7055 34.810995 29.20545 35.9749 189.8263 100
plot 2.3634 2.5708 2.927292 2.89770 3.0372 5.6479 100
可以得出结论:
进行绘制热图时候,准备矩阵的时间都在90%以上
使用稀疏矩阵进行处理矩阵能够大幅度减少运行时间
使用tidyr::pivot_longer()直接转化矩阵得到的热图 使用稀疏矩阵构建长矩阵的绘图
总结: 两者区别就在于,使用稀疏矩阵直接构建的长矩阵绘图没有给0配置颜色,直接显示空白。
对三种方式进行测试
library(microbenchmark)
library(Matrix)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
library(ChIPseeker)
files <- getSampleFiles()
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
mt <- getTagMatrix(peak = files[[5]],TxDb = txdb,
upstream = 1000,downstream = 1000,
type = "start_site",by = "gene",
weightCol = "V5")
mt <- t(apply(mt, 1, function(x) x/max(x)))
ii <- order(rowSums(mt))
mt <- mt[ii,]
colnames(mt) <- seq_len(dim(mt)[2])
rownames(mt) <- seq_len(dim(mt)[1])
func1 <- function(tagMatrix){
tagMatrix <- tagMatrix %>% as.data.frame() %>%
rownames_to_column("sample_ID") %>%
pivot_longer(-c(sample_ID),names_to = "coordinate",
values_to = "values")
tagMatrix$coordinate <- as.numeric(tagMatrix$coordinate)
tagMatrix$sample_ID <- as.numeric(tagMatrix$sample_ID)
return(tagMatrix)
}
func2 <- function(tagMatrix){
sparse <- as(tagMatrix, 'dgRMatrix')
col_pos <- sparse@j+1
row_pos <- findInterval(seq(sparse@x)-1,sparse@p[-1])+1
df <- data.frame(sample_ID=row_pos,coordinate=col_pos,values=sparse@x)
return(df)
}
func3 <- function(tagMatrix){
df <- data.frame(sample_ID = rep(1:nrow(tagMatrix), ncol(tagMatrix)),
coordinate = rep(1:ncol(tagMatrix), each = nrow(tagMatrix)),
values = as.vector(tagMatrix))
return(df)
}
bench_res <- microbenchmark(pivot_longer=func1(mt),
sparse=func2(mt),
df =func3(mt))
下面是对三种方式进行测试的结果
> bench_res
Unit: milliseconds
expr min lq mean median uq max
pivot_longer 620.3364 665.5240 704.41639 682.45545 707.81480 1255.8005
sparse 29.7513 35.9059 43.25461 40.26945 44.77245 123.5209
df 31.2031 34.7932 40.62718 37.94055 42.53955 110.7242
neval
100
100
100
我还希望看到你现有的ggplot2版本和原来的base版本的速度差别。
另外,你现在的这个PR,已经都搞定可以merge了吗?
另外,你现在的这个PR,已经都搞定可以merge了吗?
还没有换成您说的那种方法,我想着把测试结果都放出来,等您拿主意能否使用这那种方法。
我还希望看到你现有的ggplot2版本和原来的base版本的速度差别。
我对三种方法放在一起测试:
使用ggplot2画图,使用pivot_longer
func1 <- function(tagMatrix){
cols <- colorRampPalette(c("white","red"))(200)
xlim <- 1:ncol(tagMatrix)
png("../../test.png",width = 500,height = 1000)
image(x=xlim, y=1:nrow(tagMatrix),z=t(tagMatrix),
useRaster=TRUE, col=cols, yaxt="n", ylab="",
xlab="", main="")
dev.off()
}
func2 <- function(tagMatrix){
df <- data.frame(sample_ID = rep(1:nrow(tagMatrix), ncol(tagMatrix)), coordinate = rep(1:ncol(tagMatrix), each = nrow(tagMatrix)), values = as.vector(tagMatrix))
p <- ggplot(df, aes(x = coordinate,y = sample_ID)) + geom_tile(aes(fill = values)) + scale_fill_distiller(palette = "RdBu") + theme(axis.text.y=element_blank(), axis.ticks.y=element_blank(), axis.line.y = element_blank(), panel.grid=element_blank(), panel.background = element_blank()) + labs(x = "", y = "", title = "")
png("../../test.png",width = 500,height = 1000) print(p) dev.off()
}
func3 <- function(tagMatrix){
tagMatrix <- tagMatrix %>% as.data.frame() %>% rownames_to_column("sample_ID") %>% pivot_longer(-c(sample_ID),names_to = "coordinate", values_to = "values") tagMatrix$coordinate <- as.numeric(tagMatrix$coordinate) tagMatrix$sample_ID <- as.numeric(tagMatrix$sample_ID)
p <- ggplot(tagMatrix, aes(x = coordinate,y = sample_ID)) + geom_tile(aes(fill = values)) + scale_fill_distiller(palette = "RdBu") + theme(axis.text.y=element_blank(), axis.ticks.y=element_blank(), axis.line.y = element_blank(), panel.grid=element_blank(), panel.background = element_blank()) + labs(x = "", y = "", title = "")
png("../../test.png",width = 500,height = 1000) print(p) dev.off()
}
bench_res <- microbenchmark(base=func1(mt), ggplot=func2(mt), ggplot_tidyr=func3(mt))
得到测试结果
bench_res Unit: milliseconds expr min lq mean median uq base 125.5756 138.4526 196.0506 152.6029 162.7903 ggplot 5533.3312 5892.1336 6125.6924 5993.4507 6328.8083 ggplot_tidyr 5934.9380 6396.3093 6619.5268 6462.2576 6818.9751 max neval 737.5442 100 7592.5463 100 8181.8756 100
![time](https://user-images.githubusercontent.com/78794151/208698407-68eb79d8-8a45-41b0-859a-c05c0396107e.png) 得到以下结论: 1. 使用base画法是远远快过ggplot画法的 2. 使用直接把matrix当成vector转换矩阵,可以明显的发现快于使用pivot_longer,但是ggplot的画图速度还是要慢于base
为了,探究为什么ggplot画图慢,我测试了画图流程,一共测试以下三个步骤:
将对象渲染到画板上
func1 <- function(tagMatrix){
df <- data.frame(sample_ID = rep(1:nrow(tagMatrix), ncol(tagMatrix)),
coordinate = rep(1:ncol(tagMatrix), each = nrow(tagMatrix)),
values = as.vector(tagMatrix))
return(df)
}
func2 <- function(df){ p <- ggplot(df, aes(x = coordinate,y = sample_ID)) + geom_tile(aes(fill = values)) + scale_fill_distiller(palette = "RdBu") + theme(axis.text.y=element_blank(), axis.ticks.y=element_blank(), axis.line.y = element_blank(), panel.grid=element_blank(), panel.background = element_blank()) + labs(x = "", y = "", title = "")
}
func3 <- function(p){ png("../../test.png",width = 500,height = 1000) print(p) dev.off() }
bench_plot <- microbenchmark(transform_mt=func1(mt), make_ggplot=func2(df), print_ggplot=func3(p))
得到以下测试结果
bench_plot Unit: milliseconds expr min lq mean median uq transform_mt 26.5715 27.2039 35.773324 27.68520 29.01655 make_ggplot 2.3230 2.5289 2.858322 2.83215 2.97820 print_ggplot 5375.0542 5830.7697 6000.328077 5900.53445 6259.06390 max neval 529.7091 100 8.5080 100 7029.7743 100
![time](https://user-images.githubusercontent.com/78794151/208699311-6b633563-d656-4afd-ae81-1d5f6db34e6d.png) 得到以下结论: 1. 使用ggplot绘图中,将对象渲染到画板上的时间是耗时最长的,这也是它比base慢的主要原因 2. 使用接把matrix当成vector转换矩阵的方法,代替,使用pivot_longer,可以很明显的缩短准备矩阵的时间,快一个数量级。 3. 尽管缩短了处理矩阵的时间,但是由于渲染ggplot对象的耗时过长,所以整个过程还是要比base要慢
用把matrix当vector的方法 ,可以调用yulab.utils::mat2df, https://github.com/YuLab-SMU/yulab.utils/commit/9f7561f05a4f296a0c1247e474af0e7cb6a637c1
ggplot2渲染慢这个,你如果有兴趣的话,可以看一下,https://github.com/exaexa/scattermore
另一个思路我之前也一直提到,那就是用bin,减少数据分辨率。
另一个思路我之前也一直提到,那就是用bin,减少数据分辨率。
这个方法,提供了bin参数供用户使用
这个版本已经采用了把matrix当vector的方法,可以merge
This pr change the way of plotting heatmap from base system to ggplot system. It adds the function of gradient color by giving
palette
parameter, which is the palette of gradient color.There are two ways to plot the heatmap. The first is giving a created matrix to
tagHeatmap()
fucntion.The second is to use
peakHeatmap()
to do all things in one steps.If users wanna to speed up the process, a binning method can be used by specifing the
nbin
.Since this method use ggplot system, users can use ggplot method to change
a list of peaks can also be plotted.
plot body region
peakHeatmap(peak = files[[4]], TxDb = txdb, upstream = NULL, downstream = NULL, by = "gene", type = "body", nbin = 800)
extend body region with rel object
peakHeatmap(peak = files[[4]], TxDb = txdb, upstream = rel(0.2), downstream = rel(0.2), by = "gene", type = "body", nbin = 800)
extend body region with actual number
peakHeatmap(peak = files[[4]], TxDb = txdb, upstream = 1000, downstream = 1000, by = "gene", type = "body", nbin = 800)
txdb1 <- transcripts(TxDb.Hsapiens.UCSC.hg19.knownGene) txdb2 <- unlist(fiveUTRsByTranscript(TxDb.Hsapiens.UCSC.hg19.knownGene))
txdb <- list(gene19 = txdb1, geneX = txdb2) peakHeatmap_multiple_Sets(peak = files[[4]], upstream = 1000,downstream = 1000, by = c("gene19","geneX"), type = "start_site", TxDb = txdb,nbin = 800)
sample <- list(files[[4]],files[[5]]) peakHeatmap_multiple_Sets(peak = files[4:5], upstream = 1000, downstream = 1000, by = c("gene19","geneX"),type = "start_site", TxDb = txdb,nbin = 800)
txdb1 <- transcripts(TxDb.Hsapiens.UCSC.hg19.knownGene) txdb2 <- unlist(fiveUTRsByTranscript(TxDb.Hsapiens.UCSC.hg19.knownGene))[1:10000,]
txdb <- list(gene19 = txdb1, geneX = txdb2) peakHeatmap_multiple_Sets(peak = files[[4]], upstream = 1000,downstream = 1000, by = c("gene19","geneX"), type = "start_site", TxDb = txdb,nbin = 800)
peak_Profile_Heatmap(peak = files[[4]], upstream = 1000, downstream = 1000, by = "gene", type = "start_site", TxDb = txdb3, nbin = 800)
peak_Profile_Heatmap(peak = files[[4]], upstream = 1000, downstream = 1000, by = c("gene19","geneX"), type = "start_site", TxDb = txdb,nbin = 800)
peak_Profile_Heatmap(peak = files[4:5], upstream = 1000, downstream = 1000, by = c("gene19","geneX"), type = "start_site", TxDb = txdb,nbin = 800)