代谢基因和lncRNA表达量矩阵的相关性

代谢基因和lncRNA表达量矩阵的相关性 by 生信菜鸟团

上两讲中，我们拿到了长非编码基因的基因集和表达量矩阵，以及TCGA数据挖掘常见基因集合，见：把转录组测序表达量矩阵拆分成为编码与非编码、TCGA的XENA的转录组测序表达量矩阵预处理，这次我们用lncRNA和代谢相关基因的表达量矩阵做相关性计算，并解析一下它的结果。

1. 获得表达矩阵

从LAML表达量矩阵拿出lncRNA和代谢相关基因的表达量矩阵：

#lncRNA表达矩阵
load( file = 'output/rdata/0.lnc_protein_expression_data.Rdata')
lnc_dat[1:4, 1:4]
#             TCGA-AB-2949-03 TCGA-AB-2918-03 TCGA-AB-2943-03 TCGA-AB-2851-03
# ZSWIM8-AS1               15              10              10              14
# ZSCAN16-AS1             277             282             216             163
# ZRANB2-AS1               36              42              23              26
# ZNRD1-AS1              1880            1561            1744             929

#metabolism代谢表达矩阵
genes_target <- read.table('input/metabolism-genes-list.txt')[,1] 
kp <- intersect(rownames(dat), genes_target)
target_dat <- dat[kp,]
target_dat[1:4, 1:4]
#          TCGA-AB-2949-03 TCGA-AB-2918-03 TCGA-AB-2943-03 TCGA-AB-2851-03
# ZACN                 104             103             128             117
# XYLT2                919            1633             839            1305
# XDH                    1               3               1               1
# VKORC1L1            1578            1712            1436             954

save(lnc_dat, 
     genes_target, 
     file = 'output/rdata/0.lnc_metabolism_expression_data.Rdata')

2. 计算相关性

load( file = 'output/rdata/0.lnc_metabolism_expression_data.Rdata')
identical(colnames(lnc_dat), colnames(target_dat))
#[1] TRUE

m1=target_dat  #代谢
m2=lnc_dat

#定义计算相关性的函数
cor_2_matrix <- function(m1,m2){
  apply(m2 , 1, function(x){
    #  x = m2[2,]
    unlist(apply(m1, 1,function(y){
      # y  = m1[1,]
      cor(as.numeric(x), #cor
          as.numeric(y))
    }))
  })
}

rdf[1:4,1:4]
#           ZSWIM8-AS1 ZSCAN16-AS1  ZRANB2-AS1   ZNRD1-AS1
#ZACN      0.006748374  0.02089440  0.09556742  0.31465553
#XYLT2     0.215282382  0.27156587 -0.01498975  0.19231130
#XDH      -0.062246960 -0.04867204  0.05621079 -0.02622855
#VKORC1L1 -0.272115650  0.15826855  0.24478046  0.35947612

if(F){  #运行时间比较久，下面保存结果，避免重复运行
  rdf=cor_2_matrix( m1  , m2 ) 
}

#得到相关性p值的矩阵
corP_2_matrix <- function(m1,m2){
  apply(m2 , 1, function(x){
    #  x = m2[2,]
    unlist(apply(m1, 1,function(y){
      # y  = m1[1,]
      cor.test(as.numeric(x),          #cor.test
               as.numeric(y))$p.value  #cor.test()$p.value
    }))
  })
}

if(F){
  pdf=corP_2_matrix( m1 , m2 ) 
}

pdf[1:4,1:4]
#           ZSWIM8-AS1 ZSCAN16-AS1  ZRANB2-AS1    ZNRD1-AS1
#ZACN     0.9344580531 0.798994257 0.243102469 8.325045e-05
#XYLT2    0.0079389954 0.000743695 0.855051633 1.799850e-02
#XDH      0.4476827269 0.552864022 0.493008150 7.492120e-01
#VKORC1L1 0.0007247521 0.052267187 0.002452533 5.813088e-06

if(F){
save(rdf,pdf,file = 'output/rdata/step1_cor_results1.Rdata')
}

设置阈值

cor_thred <- 0.7            
pvalue_thred  <- 0.01

library(reshape2)
pdfm = melt(pdf) #将数据宽边长
head(pdfm)
#      Var1       Var2        value
#1     ZACN ZSWIM8-AS1 0.9344580531
#2    XYLT2 ZSWIM8-AS1 0.0079389954
#3      XDH ZSWIM8-AS1 0.4476827269
#4 VKORC1L1 ZSWIM8-AS1 0.0007247521
#5    VDAC2 ZSWIM8-AS1 0.3495754229
#6    VDAC1 ZSWIM8-AS1 0.0120853025

rdfm = melt(rdf)
head(rdfm)
#      Var1       Var2        value
#1     ZACN ZSWIM8-AS1  0.006748374
#2    XYLT2 ZSWIM8-AS1  0.215282382
#3      XDH ZSWIM8-AS1 -0.062246960
#4 VKORC1L1 ZSWIM8-AS1 -0.272115650
#5    VDAC2 ZSWIM8-AS1  0.076647718
#6    VDAC1 ZSWIM8-AS1 -0.203777611

table(pdfm$value  < pvalue_thred )
#   FALSE    TRUE 
# 5875071 1128795 

table(abs(rdfm$value) > cor_thred )
#   FALSE    TRUE 
# 7001719    2147 

#设置筛选条件
kp = (pdfm$value  < pvalue_thred ) & (abs(rdfm$value) > cor_thred )
table(kp)
#   FALSE    TRUE 
# 7001719    2147

cor_df =  rdfm[kp,]
head(cor_df)
#         Var1        Var2     value
#15333 SLC38A4 ZNF385D-AS2 0.7822774
#29392   EPHX2    ZIM2-AS1 0.7123947
#29443 CYP4F12    ZIM2-AS1 0.7473137
#29500    CDO1    ZIM2-AS1 0.8504875
#29666  ABCB11    ZIM2-AS1 0.7221485
#38112  HS3ST2  ZBTB20-AS3 0.7256150

colnames(cor_df)[1:2]=c('target','lncRNA' )
cor_df$Regulation = ifelse(cor_df$value > 0 ,'pos','neg')
table(cor_df$Regulation ) #此时相关性全是pos正相关
# pos 
#2147 

#save data
save(cor_df,file = 'output/rdata/step1_cor_results.Rdata')
length(unique(cor_df$lncRNA))

3. 可视化

library(ggalluvial)
library(ggplot2)
library(ggsci)
library(RColorBrewer)

data <- cor_df
length(unique(data$target))
#[1] 271
length(unique(data$lncRNA))
#[1] 775
head(data)
#       target      lncRNA     value Regulation
#15333 SLC38A4 ZNF385D-AS2 0.7822774        pos
#29392   EPHX2    ZIM2-AS1 0.7123947        pos
#29443 CYP4F12    ZIM2-AS1 0.7473137        pos
#29500    CDO1    ZIM2-AS1 0.8504875        pos
#29666  ABCB11    ZIM2-AS1 0.7221485        pos
#38112  HS3ST2  ZBTB20-AS3 0.7256150        pos

colourCount <- length(unique(data$target))
getPalette <- colorRampPalette(brewer.pal(12, "Paired"))

3.1 ggalluvium

ggplot(data, aes(axis1 = lncRNA, axis2 = target,
               axis3 = Regulation,
               y = value)) +
scale_x_discrete(limits = c("lncRNA", "target", "Regulation"), expand = c(.2, .05))+
xlab("CorGraph") +
geom_alluvium(aes(fill = Regulation),width = 1/12, alpha = 1, knot.pos = 0.1, reverse = FALSE)+
scale_fill_aaas()+
geom_stratum(width= 1/12, reverse = FALSE)+
# geom_text(stat = "stratum", aes(label = after_stat(stratum)))+
theme_bw() + 
theme(axis.line = element_blank(),
      axis.ticks = element_blank(),
      axis.text.y = element_blank(),
      panel.grid = element_blank(),
      panel.border = element_blank())+ 
ggtitle("Correlation between lncRNAs and target genes",
        "stratified by regulation") 
ggsave( filename = 'output/plot/step1.doCor_ggalluvium.pdf'), 
      width = 10, height = 6)

可以看到在我们选定的阈值下，相关性都为正相关。

3.2 barplot

查看的是代谢基因（横坐标）所对应的lncRNA的数量（纵坐标）。

ggplot(data) + 
geom_bar(aes(factor(target), fill=factor(target))) + 
scale_fill_manual(values = getPalette(colourCount))+
xlab("") + ylab("")+
theme_bw()+
theme(axis.text.x = element_text(angle = 45, vjust = 0.6))

ggsave( filename = 'output/plot/step1.doCor_barplot.pdf',   
      width = 10, height = 8)

colnames(data)
ggplot(data) + 
geom_bar(aes(factor(target), fill=factor(target))) + 
scale_fill_manual(values = getPalette(colourCount))+
xlab("") + ylab("")+
theme_bw()+
theme(axis.text.x = element_text(angle = 45, vjust = 0.6)) + facet_wrap(~Regulation,nrow=2)  #加上正负相关性的分组，不过这里只有正相关

ggsave(filename = 'output/plot/step1.doCor_barplot_split.pdf',
     width = 10, height = 8)

所有基因的基因名图例非常多，下面基因名也看不清了。

挑选相关性前一百个基因看一下。但其实对应的lncRNA会不全，没有什么意义。

#相关性前100个基因（有重复）
data1 <- data[order(data$value,decreasing = T),]
data1 = data[1:100,]

colourCount <- length(unique(data1$target))

ggplot(data1) + 
  geom_bar(aes(factor(target), fill=factor(target))) + 
  scale_fill_manual(values = getPalette(colourCount))+
  xlab("") + ylab("")+
  theme_bw()+
  theme(axis.text.x = element_text(angle = 45, vjust = 0.6)) + facet_wrap(~Regulation,nrow=2)
ggsave(filename = 'output/plot/step1.100doCor_barplot_split.png'),
       width = 20, height = 12)

那看一下对应lncRNA数量前50的基因：

a=names(sort(table(data$target),decreasing = T)[1:50])
data2=data[data$target%in%a,]

colourCount <- length(unique(data2$target))

ggplot(data2) + 
  geom_bar(aes(factor(target), fill=factor(target))) + 
  scale_fill_manual(values = getPalette(colourCount))+
  xlab("") + ylab("")+
  theme_bw()+
  theme(axis.text.x = element_text(angle = 45, vjust = 0.6))
ggsave(filename = paste0(pro,'_','output/plot/step1.50doCor_barplot_split.png'),
       width = 20, height = 12)

ixxmu / mp_duty

代谢基因和lncRNA表达量矩阵的相关性 #2784