Closed ixxmu closed 3 months ago
如何能够像网页工具一样,仅输入基因名称,通过一套代码就可以获得基因表达量在不同样本中的表达量差异图,基因表达量差异与临床病理参数之间的关系图以及表达量差异与生存的关系图呢?
生信菜鸟团既往内容链接:
根据TMN以及stage和grade等临床信息查看TCGA的LIHC表达量矩阵的SLC25A11基因情况
这次曾老师就交代了这样的作业,希望能够只输入基因名就能把A-E的图给呈现出来。
PMID:37789080
rm(list=ls())
#打破下载时间的限制,改前60秒,改后1亿秒
options(timeout = 100000000)
options(scipen = 20)#不要以科学计数法表示
library(tidyverse)
library(data.table)
proj = "TCGA-KIRC"
#下载内容之前需要正确填写destfile的位置哦
if(T){
download.file(url = paste0("https://gdc.xenahubs.net/download/",proj, ".htseq_counts.tsv.gz"),destfile = paste0("~/Desktop/KIRC_analysis/",proj,".htseq_counts.tsv.gz")) ##表达数据
download.file(url = paste0("https://gdc.xenahubs.net/download/",proj, ".GDC_phenotype.tsv.gz"),destfile = paste0("~/Desktop/KIRC_analysis/",proj,".GDC_phenotype.tsv.gz")) ##临床数据
download.file(url = paste0("https://gdc.xenahubs.net/download/",proj, ".survival.tsv"),destfile = paste0("~/Desktop/KIRC_analysis/",proj,".survival.tsv")) ##生存数据
}
#读取数据
clinical = read.delim(paste0("~/Desktop/KIRC_analysis/",proj,".GDC_phenotype.tsv.gz"),fill = T,header = T,sep = "\t")
surv = read.delim(paste0("~/Desktop/KIRC_analysis/",proj,".survival.tsv"),header = T)
head(surv) #生存数据os和os.time
#1 TCGA-BP-4337-01A 1 TCGA-BP-4337 2
#2 TCGA-BP-4337-11A 1 TCGA-BP-4337 2
#3 TCGA-A3-A8CQ-01A 0 TCGA-A3-A8CQ 3
#4 TCGA-A3-A6NN-01A 0 TCGA-A3-A6NN 3
#5 TCGA-B4-5844-01A 0 TCGA-B4-5844 7
#6 TCGA-B4-5843-01A 0 TCGA-B4-5843 11
1.1 查看临床信息列名
tmp = data.frame(colnames(clinical))
#这个环节有助于提取自己想要的数据
1.2 stage
table(clinical$tumor_stage.diagnoses)
#not reported stage i stage ii stage iii stage iv
# 4 481 102 237 161
1.3 M分期
table(clinical$pathologic_M)
# M0 M1 MX
# 2 794 155 34
1.4 T分期
table(clinical$pathologic_T)
#T1 T1a T1b T2 T2a T2b T3 T3a T3b T3c T4
#37 250 205 101 15 8 7 234 102 4 22
1.5 N分期
table(clinical$pathologic_N)
# N0 N1 NX
#446 30 509
1.6 grade
table(clinical$neoplasm_histologic_grade)
# G1 G2 G3 G4 GX
# 5 21 415 385 153 6
2.处理表达矩阵和分组信息2.1 表达矩阵
#dat = read.table(paste0("input/",proj,".htseq_counts.tsv.gz"),check.names = F,row.names = 1,header = T)
dat <- data.table::fread(paste0("~/Desktop/KIRC_analysis/",proj,".htseq_counts.tsv.gz"),data.table = F) #全部是symbol
#head(dat)
#tail(dat)
a = dat[60484:60488,]
#dat的后五行是啥啊
#数据是log2(count+1),因此需要逆转并cpm处理
dat = dat[1:60483,]
rownames(dat) = dat$Ensembl_ID
a = dat
a = a[,-1]
##逆转 log
a = as.matrix(2^a - 1)
a[1:4,1:4]
# TCGA-BP-4342-01A TCGA-A3-3387-01A TCGA-BP-4167-01A TCGA-B8-4620-01A
#ENSG00000000003.13 2847 2261 2170 4586
#ENSG00000000005.5 2 17 10 3
#ENSG00000000419.11 1585 1642 979 1903
#ENSG00000000457.12 1372 895 457 505
# 用apply转换为整数矩阵
# 是因为后续edgeR分析时需要用到整数吗
exp = apply(a, 2, as.integer)
exp[1:4,1:4] #行名消失
# TCGA-BP-4342-01A TCGA-A3-3387-01A TCGA-BP-4167-01A TCGA-B8-4620-01A
# [1,] 2847 2260 2170 4586
# [2,] 2 16 10 3
# [3,] 1585 1641 979 1902
# [4,] 1371 894 456 505
rownames(exp) = rownames(dat)
exp[1:4,1:4]
# TCGA-BP-4342-01A TCGA-A3-3387-01A TCGA-BP-4167-01A TCGA-B8-4620-01A
#ENSG00000000003.13 2847 2260 2170 4586
#ENSG00000000005.5 2 16 10 3
#ENSG00000000419.11 1585 1641 979 1902
#ENSG00000000457.12 1371 894 456 505
#cpm处理
exp= log(edgeR::cpm(exp)+1)
2.2 表达矩阵行名ID转换:
library(stringr)
head(rownames(exp))
#[1] "ENSG00000000003.13" "ENSG00000000005.5" "ENSG00000000419.11"
#[4] "ENSG00000000457.12" "ENSG00000000460.15" "ENSG00000000938.11"
library(AnnoProbe)
?annoGene
#怎么匹配不上呢?是因为ENSEMBL有小数?下面这句高级一点哈哈哈
rownames(exp) = substr(rownames(exp), 1, 15)
##rownames(exp) = str_split(rownames(exp),"\\.",simplify = T)[,1];head(rownames(exp))
#annoGene(rownames(exp),ID_type = "ENSEMBL")
re = annoGene(rownames(exp),ID_type = "ENSEMBL");head(re)
# SYMBOL biotypes ENSEMBL chr start end
#1 DDX11L1 transcribed_unprocessed_pseudogene ENSG00000223972 chr1 11869 14409
#2 WASH7P unprocessed_pseudogene ENSG00000227232 chr1 14404 29570
#3 MIR6859-1 miRNA ENSG00000278267 chr1 17369 17436
#4 MIR1302-2HG lncRNA ENSG00000243485 chr1 29554 31109
#6 FAM138A lncRNA ENSG00000237613 chr1 34554 36081
#7 OR4G4P unprocessed_pseudogene ENSG00000268020 chr1 52473 53312
library(tinyarray)
#trans_array: transform rownames for microarray or rnaseq expression matrix
exp = trans_array(exp,ids = re,from = "ENSEMBL",to = "SYMBOL")
exp[1:4,1:4]
# TCGA-BP-4342-01A TCGA-A3-3387-01A TCGA-BP-4167-01A TCGA-B8-4620-01A
#DDX11L1 1 0 0 0
#WASH7P 77 69 23 29
#MIR6859-1 0 3 1 2
#MIR1302-2HG 2 0 0 0
2.4 保存数据
save(exp,proj,clinical,surv,file = paste0(proj,".Rdata"))
1.加载数据和R包
rm(list = ls())
proj = "TCGA-KIRC"
load(paste0(proj,".Rdata"))
library(tidyr)
library(tibble)
library(ggpubr)
library(dplyr)
library(stringr)
gene = "KIFC1"
1.1 分组信息处理
#根据样本ID的第14-15位,给样本分组(tumor和normal)
table(str_sub(colnames(exp),14,15))
# 01 05 11
#534 1 72
#这里的14-15位代表的样本的情况,其中01代表Primary Soild Tumor;05代表Additional -New Primary;11代表正常样本
Group = ifelse(as.numeric(str_sub(colnames(exp),14,15)) < 10,'tumor','normal')
Group = factor(Group,levels = c("normal","tumor"))
table(Group)
# Group
# normal tumor
# 72 535
2.准备数据
#提取想要的基因数据
exp_g <- exp[gene,] #这里需要输入想要分析的基因
exp_g <- as.data.frame(t(exp_g))
#准备数据
dat = t(exp_g) %>%
as.data.frame() %>% #上两步是转置信息并表格化
rownames_to_column() %>% #把行名转成一列数据
mutate(Group)
dim(dat)
# [1] 603 3
pdat = dat%>%
pivot_longer(cols =(!rowname)&(!Group), #除了 rowname 和 Group 之外的所有列
names_to = "gene", #指定新的列名,用于存储原来列的名称
values_to = "count") #指定新的列名,用于存储原来列中的值
3.正态性检验-对每一组数据进行正态性检验
#分别提取normal和tumor数据
normality_N <- pdat[pdat$Group == "normal",]
normality_T <- pdat[pdat$Group == "tumor",]
#检验数据是否满足正态分布的常见方法有:
# Shapiro-Wilk检验,Kolmogorov-Smirnov检验,Anderson-Darling检验,Lilliefors检验,Jarque-Bera检验,D'Agostino's K-squared检验
#最常用以下两种
#Shapiro-Wilk 正态性检验:最常用的正态性检验方法之一。适用于小样本数据(通常认为小于30)。
#shapiro.test(normality_N$count)
#shapiro.test(normality_T$count)
#Kolmogorov-Smirnov (K-S)正态性检验:这是一种基于经验分布函数的检验方法,适用于大样本数据。
ks.test(normality_N$count, "pnorm", mean(normality_N$count), sd(normality_N$count))
# Exact one-sample Kolmogorov-Smirnov test
# data: normality_N$count
# D = 0.16794, p-value = 0.03038
# alternative hypothesis: two-sided
ks.test(normality_T$count, "pnorm", mean(normality_T$count), sd(normality_T$count))
# Asymptotic one-sample Kolmogorov-Smirnov test
# data: normality_T$count
# D = 0.059156, p-value = 0.0473
# alternative hypothesis: two-sided
#看p值,若满足p-value>0.05,则数据满足正态分布,若不满足则用非参数检验。
#同时我们注意到,即使是KS法对样本进行分析时,也还是会按照大样本的中样本量的多与少采取不同的分析方法“Exact”/“Asymptotic”。
#直方图
hist(normality_N$count, main="Histogram for frequency") #默认是probability = F, 频数直方图
hist(normality_N$count, main="Histogram for density", probability = T) #probability = T, 密度直方图,每个区间的频率除以该区间的宽度和总的数据点数量,从而得到密度
hist(normality_T$count, main="Histogram for frequency")
hist(normality_T$count, main="Histogram for density", probability = T)
#QQ图(Quantile-Quantile Plot):这是一种图形方法,可以直观地检查数据是否接近正态分布。
#在QQ图中,如果数据点紧密地遵循参考线(45度线),则数据接近正态分布
qqnorm(normality_N$count)
qqline(normality_N$count)
qqnorm(normality_T$count)
qqline(normality_T$count)
4.方差齐性检验
#方差性(方差齐性)的检验,常用的方法是:
# Bartlett's Test 适用于正态分布数据
bartlett.test(pdat$count ~ Group, data = pdat)
# Levene's Test 适用于不确定数据是否符合正态分布的情况
library(car)
leveneTest(pdat$count ~ Group, data = pdat)
#P值小于0.05时,方差不齐
#Levene's Test for Homogeneity of Variance (center = median)
# Df F value Pr(>F)
#group 1 9.5624 0.002077 **
# 605
#---
#Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# 该数据需采用两样本的非参数检验-wilcox.test
# 绘制小提琴图和显著性标记
library(ggplot2)
library(ggpubr)
png("p.png",width = 1200,height = 1200,res = 300)
ggplot(data=pdat,aes(x=pdat$Group,y=pdat$count,
colour = pdat$Group))+ #fill参数不要设置,会不好看
geom_violin(#color = 'grey',
alpha = 0.8, #alpha = 0.8 参数控制着小提琴图的透明度。
scale = 'width',#小提琴宽度
#linewidth = 1, #外轮廓粗细
trim = TRUE)+ # trim = TRUE 参数控制着小提琴图的形状。
geom_boxplot(mapping=aes(x=pdat$Group,y=pdat$count,
colour=pdat$Group,fill=pdat$Group), #箱线图
alpha = 0.5,
size=1.5,
width = 0.3)+
geom_jitter(mapping=aes(x=pdat$Group,y=pdat$count,colour=pdat$Group), #散点
alpha = 0.3,size=3)+
scale_fill_manual(limits=c("normal","tumor"),
values =c("#9E9E9E","#F5C96B"))+
scale_color_manual(limits=c("normal","tumor"),
values=c("#9E9E9E","#F5C96B"))+ #颜色
geom_signif(mapping=aes(x=pdat$Group,y=pdat$count), # 不同组别的显著性
comparisons = list(c("normal","tumor")), # 哪些组进行比较
map_signif_level=T, # T显示显著性,F显示p value
tip_length=c(0,0),#把向下的帽子去掉,分组数乘以2
y_position = c(4.5), # 设置显著性线的位置高度
size=1, # 修改线的粗细
textsize = 4, # 修改显著性标记的大小
test = "wilcox.test", # 检验的类型,可以更改
color = "black")+ # 设置显著性线的颜色
theme_bw()+ #设置白色背景
guides(fill = guide_legend(title = "Group"), # 设置填充图例的标题
color = guide_legend(title = "Group"))+ # 设置颜色图例的标题
labs(title = "TCGA-KIRC", # 设置标题
x="",y= paste0("The mRNA expression of ",gene)) # 添加标题,x轴,y轴标签
dev.off()
#查看典型管家基因(如:GAPDH、ACTB)的表达量,如果表达量高于正常值,说明我们数据没问题。
exp['GAPDH',]
exp['ACTB',]
#随机取样基因,对比管家基因的表达
#我们可以看到我们数据中两个管家基因的表达量都偏高,符合预期。
cg = exp[sample(nrow(exp), 3), ]
boxplot(cg)
##所以数据也没啥问题
1.数据处理
表达矩阵需要tumor和normal,新表达矩阵数据命名为exprSet;临床信息需要进一步整理,成为生存分析需要的格式,新临床信息数据命名为meta。
#加载数据和R包
rm(list=ls())
proj = "TCGA-KIRC"
load(paste0(proj,".Rdata"))
library(stringr)
library(dplyr)
1.1 数据准备
#临床信息和表达矩阵取交集
rownames(clinical) <- clinical$submitter_id.samples
meta <- clinical
exprSet <- exp
p <- identical(rownames(meta),colnames(exprSet));p
if(!p) {
s = intersect(rownames(meta),colnames(exprSet))
exprSet = exprSet[,s]
meta = meta[s,]
}
1.2 选列并简化列名
tmp <- as.data.frame(colnames(meta))
#如果有其他感兴趣的指标可以多提取一些
meta = meta[,c(
'submitter_id.samples',
'neoplasm_histologic_grade',
'pathologic_T' ,
'pathologic_M',
'pathologic_N',
'age_at_index.demographic',
'gender.demographic' ,
'tumor_stage.diagnoses')]
colnames(meta)=c('ID','grade','T_stage','M_stage','N_stage','age','gender','stage')
1.3 简化、规范内容
#检查各列的信息是否规范 没有冗余信息 缺失的信息用NA代替
#——Stage——————————————————————————————————
table(meta$stage)
#not reported stage i stage ii stage iii stage iv
# 3 294 69 139 102
meta$stage = meta$stage %>%
str_replace("not reported",as.character(NA)) %>%
str_remove("stage ") %>%
str_to_upper() # 罗马数字1、2、3、4小写变大写
stage = meta$stage
#下面这段内容可以在合适的情况下启用
#stage = case_when(stage == 'I' ~ 'I',
# stage == 'II' ~ 'II',
# stage == 'III' ~ 'III',
# stage == 'IIIA' ~ 'III',
# stage == 'IIIB' ~ 'III',
# stage == 'IIIC' ~ 'III',
# stage == 'IV' ~ 'IV',
# stage == 'IVB' ~ 'IV',
# TRUE ~ as.character(T))
#table(stage) #如果上述这段启用了,后续需要赋值
#meta$stage = stage #如果上述这段启用了,后续需要赋值
#把缺失的值和不能确定的值都变为空值
#下面的代码是指把stage中不包含I的内容全部转换成字符型的NA数据
meta$stage[!str_detect(meta$stage,"I")]=as.character(NA) #na替换空
table(meta$stage,useNA = "always") #always的含义是总是计算NA
# I II III IV <NA>
# 294 69 139 102 3
#——T_stage——————————————————————————————————
T_stage = meta$T_stage
T_stage = case_when(T_stage == 'T1' ~ 'T1',
T_stage == 'T1a' ~ 'T1',
T_stage == 'T1b' ~ 'T1',
T_stage == 'T2' ~ 'T2',
T_stage == 'T2a' ~ 'T2',
T_stage == 'T2b' ~ 'T2',
T_stage == 'T3' ~ 'T3',
T_stage == 'T3a' ~ 'T3',
T_stage == 'T3b' ~ 'T3',
T_stage == 'T3c' ~ 'T3',
T_stage == 'T4' ~ 'T4',
TRUE ~ as.character(T)) #对于所有其他情况,保持原来的值不变。
table(T_stage)
#T_stage
# T1 T2 T3 T4
#302 84 208 13
meta$T_stage = T_stage
meta$T_stage[!str_detect(meta$T_stage,"T")]=as.character(NA) #na替换空
#meta$T_stage[str_detect(meta$T_stage,"TX")]=as.character(NA) #na替换空
#table(meta$T_stage)
#——grade————————————————————————————————————
table(meta$grade,useNA = "always")
# G1 G2 G3 G4 GX <NA>
# 3 15 259 235 90 5 0
meta$grade[!str_detect(meta$grade,"G")]=as.character(NA) #na替换空
table(meta$grade,useNA = "always")
# G1 G2 G3 G4 GX <NA>
# 15 259 235 90 5 3
#——M_stage——————————————————————————
table(meta$M_stage,useNA = "always")
# M0 M1 MX <NA>
# 2 477 97 31 0
meta$M_stage[!str_detect(meta$M_stage,"M")]=as.character(NA) #na替换空
meta$M_stage[str_detect(meta$M_stage,"MX")]=as.character(NA) #MX替换空
table(meta$M_stage,useNA = "always")
# M0 M1 <NA>
# 2 477 97 31
#——N_stage————————————————
meta$N_stage[!str_detect(meta$N_stage,"N")]=as.character(NA) #na替换空
meta$N_stage[str_detect(meta$N_stage,"NX")]=as.character(NA) #NX替换空
table(meta$N_stage,useNA = "always")
# N0 N1 <NA>
# 277 18 307
#——gender——————————————————————
table(meta$gender,useNA = "always")
#——age————————————————————
table(meta$age,useNA = "always")
meta$age = ifelse(as.numeric(str_sub(meta$age)) < 60,'<=60','>60')
table(meta$age,useNA = "always")
#<=60 >60 <NA>
# 276 331 0
1.4 实现表达矩阵与临床信息的匹配
#因为临床信息那一环节去除了没有生存的数据,所以样本量有了变化
#在保存之前重新匹配一下
rownames(meta) <- meta$ID
s = intersect(rownames(meta),colnames(exprSet));length(s)
#[1] 607
exprSet = exprSet[,s]
identical(rownames(meta),colnames(exprSet))
#> [1] TRUE
1.5 分组信息处理
#这次分组完只要后面顺序不变样本不删除就没啥问题
#这里的分组是按照exprSet的列名去分的哦! exprSet和meta的顺序是一致的
#根据样本ID的第14-15位,给样本分组(tumor和normal)
table(str_sub(colnames(exprSet),14,15))
#这里的14-15位代表的样本的情况,其中01代表Primary Soild Tumor;05代表Additional -New Primary;11代表正常样本
Group = ifelse(as.numeric(str_sub(colnames(exprSet),14,15)) < 10,'tumor','normal')
Group = factor(Group,levels = c("normal","tumor"))
table(Group)
# Group
# normal tumor
# 72 535
save(meta,exprSet,Group,surv,proj,file = paste0(proj,"_clinical_exp.Rdata"))
1.加载数据和R包
rm(list = ls())
proj = "TCGA-KIRC"
load(paste0(proj,"_clinical_exp.Rdata"))
library(tidyr)
library(tibble)
library(ggpubr)
library(dplyr)
#该部分内容除了修改gene信息以外,还需要注意修改临床参数。因为不同数据中clinical参数会有不同
gene = "KIFC1"
2.准备数据
#提取想要的基因数据
g <- exprSet[gene,]
meta <- cbind(meta,g)
#在把Group合并上去
meta <- cbind(meta,Group)
#把临床信息分组情况做好标注
meta$grade <- ifelse(meta$Group == "normal","Normal",meta$grade)
meta$stage <- ifelse(meta$Group == "normal","Normal",meta$stage)
meta$T_stage <- ifelse(meta$Group == "normal","Normal",meta$T_stage)
meta$N_stage <- ifelse(meta$Group == "normal","Normal",meta$N_stage)
meta$M_stage <- ifelse(meta$Group == "normal","Normal",meta$M_stage)
3.临床参数绘图
3.1 T_stage
# 多分组数据可采用kruskal-wallis test
# 其中两样本采用非参数检验-wilcox.test
#存在NA数据因此需要把数据单独提取出来之后去除NA行
meta_T <- na.omit(meta[,c(3,9)])
meta_T$T_stage <- factor(meta_T$T_stage, levels = c("Normal","T1","T2","T3","T4"))
# 绘制小提琴图和显著性标记
library(ggplot2)
library(ggpubr)
png(paste0(gene,"_T_stage.png"),width = 1200,height = 1200,res = 300)
ggplot(data=meta_T,aes(x=T_stage,y=g,colour = T_stage))+ #fill参数不要设置,会不好看
geom_violin(#color = 'grey',
alpha = 0.8, #alpha = 0.8 参数控制着小提琴图的透明度。
scale = 'width',#小提琴宽度
#linewidth = 1, #外轮廓粗细
trim = TRUE)+ # trim = TRUE 参数控制着小提琴图的形状。
geom_boxplot(mapping=aes(x=T_stage,y=g,colour = T_stage,fill=T_stage), #箱线图
alpha = 0.5,size=1.5,width = 0.3)+
geom_jitter(mapping=aes(x=T_stage,y=g,colour = T_stage), #散点
alpha = 0.3,size=3)+
scale_fill_manual(limits=c("Normal","T1","T2","T3","T4"),
values =c("#312613","#BFA16F","#FB8C82","#B0C8CA","#F7EEE2"))+
scale_color_manual(limits=c("Normal","T1","T2","T3","T4"),
values=c("#312613","#BFA16F","#FB8C82","#B0C8CA","#F7EEE2"))+ #颜色
geom_signif(mapping=aes(x=T_stage,y=g), # 不同组别的显著性
comparisons = list(c("Normal","T1"),
c("Normal","T2"),
c("Normal","T3"),
c("Normal","T4"),
c("T1","T2"),
c("T1","T3"),
c("T1","T4"),
c("T2","T3"),
c("T2","T4"),
c("T3","T4")), # 哪些组进行比较
map_signif_level=T, # T显示显著性,F显示p value
tip_length=c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0),#把向下的帽子去掉,分组数乘以2
y_position = c(4.5,4.8,5.1,5.4,5.7,6,6.3,6.6,6.9,7.2), # 设置显著性线的位置高度
size=0.5, # 修改线的粗细
textsize = 2, # 修改显著性标记的大小
test = "wilcox.test", # 检验的类型,可以更改
color = "black")+ # 设置显著性线的颜色
theme_bw()+ #设置白色背景
guides(fill = guide_legend(title = "Group"), # 设置填充图例的标题
color = guide_legend(title = "Group"))+ # 设置颜色图例的标题
labs(title = "TCGA-KIRC", # 设置标题
x="",y= paste0("The mRNA expression of ",gene)) # 添加标题,x轴,y轴标签
dev.off()
3.2 N_stage
# 多分组数据可采用kruskal-wallis test
# 其中两样本采用非参数检验-wilcox.test
#存在NA数据因此需要把数据单独提取出来之后去除NA行
meta_N <- na.omit(meta[,c(5,9)])
meta_N$N_stage <- factor(meta_N$N_stage, levels = c("Normal", "N0", "N1"))
# 绘制小提琴图和显著性标记
library(ggplot2)
library(ggpubr)
png(paste0(gene,"_N_stage.png"),width = 1200,height = 1200,res = 300)
ggplot(data=meta_N,aes(x=N_stage,y=g,colour = N_stage))+ #fill参数不要设置,会不好看
geom_violin(#color = 'grey',
alpha = 0.8, #参数控制着小提琴图的透明度。
scale = 'width',#小提琴宽度
#linewidth = 1, #外轮廓粗细
trim = TRUE #参数控制着小提琴图的形状
) +
geom_boxplot(mapping=aes(x=N_stage,y=g,colour = N_stage,fill=N_stage), #箱线图
alpha = 0.5,size=1.5,width = 0.3)+
geom_jitter(mapping=aes(x=N_stage,y=g,colour = N_stage),
alpha = 0.3,size=3)+#散点
scale_fill_manual(limits=c("Normal","N0","N1"),
values =c("#312613","#BFA16F","#FB8C82"))+
scale_color_manual(limits=c("Normal","N0","N1"),
values=c("#312613","#BFA16F","#FB8C82"))+ #颜色
geom_signif(mapping=aes(x=N_stage,y=g), # 不同组别的显著性
comparisons = list(c("Normal","N0"),
c("Normal","N1"),
c("N0","N1")), # 哪些组进行比较
map_signif_level=T, # T显示显著性,F显示p value
tip_length=c(0,0,0,0,0,0),#把向下的帽子去掉,分组数乘以2
y_position = c(4.5,4.8,5.1), # 设置显著性线的位置高度
size=0.5, # 修改线的粗细
textsize = 2, # 修改显著性标记的大小
test = "wilcox.test", # 检验的类型,可以更改
color = "black",
)+ # 设置显著性线的颜色
theme_bw()+ #设置白色背景
guides(fill = guide_legend(title = "Group"), # 设置填充图例的标题
color = guide_legend(title = "Group"))+ # 设置颜色图例的标题
labs(title = "TCGA-KIRC", # 设置标题
x="",y= paste0("The mRNA expression of ",gene)) # 添加标题,x轴,y轴标签
dev.off()
3.3 M_stage
# 多分组数据可采用kruskal-wallis test
# 其中两样本采用非参数检验-wilcox.test
#存在NA数据因此需要把数据单独提取出来之后去除NA行
meta_M <- na.omit(meta[,c(4,9)])
meta_M$M_stage <- factor(meta_M$M_stage, levels = c("Normal", "M0", "M1"))
# 绘制小提琴图和显著性标记
library(ggplot2)
library(ggpubr)
png(paste0(gene,"_M_stage.png"),width = 1200,height = 1200,res = 300)
ggplot(data=meta_M,aes(x=M_stage,y=g,colour = M_stage))+ #fill参数不要设置,会不好看
geom_violin(alpha = 0.8, #参数控制着小提琴图的透明度。
scale = 'width',#小提琴宽度
#linewidth = 1, #外轮廓粗细
trim = TRUE ) + #参数控制着小提琴图的形状
geom_boxplot(mapping=aes(x=M_stage,y=g,
colour = M_stage,fill=M_stage), #箱线图
alpha = 0.5,size=1.5,width = 0.3)+
geom_jitter(mapping=aes(x=M_stage,y=g,colour = M_stage), #散点
alpha = 0.3,size=3)+
scale_fill_manual(limits=c("Normal","M0","M1"),
values =c("#312613","#BFA16F","#FB8C82"))+
scale_color_manual(limits=c("Normal","M0","M1"),
values=c("#312613","#BFA16F","#FB8C82"))+ #颜色
geom_signif(mapping=aes(x=M_stage,y=g), # 不同组别的显著性
comparisons = list(c("Normal","M0"),
c("Normal","M1"),
c("M0","M1")), # 哪些组进行比较
map_signif_level=T, # T显示显著性,F显示p value
tip_length=c(0,0,0,0,0,0),#把向下的帽子去掉,分组数乘以2
y_position = c(4.5,4.8,5.1), # 设置显著性线的位置高度
size=0.5, # 修改线的粗细
textsize = 2, # 修改显著性标记的大小
test = "wilcox.test", # 检验的类型,可以更改
color = "black",
)+ # 设置显著性线的颜色
theme_bw()+ #设置白色背景
guides(fill = guide_legend(title = "Group"), # 设置填充图例的标题
color = guide_legend(title = "Group"))+ # 设置颜色图例的标题
labs(title = "TCGA-KIRC", # 设置标题
x="",y= paste0("The mRNA expression of ",gene)) # 添加标题,x轴,y轴标签
dev.off()
3.4 Grade_stage
# 多分组数据可采用kruskal-wallis test
# 其中两样本采用非参数检验-wilcox.test
#存在NA数据因此需要把数据单独提取出来之后去除NA行
meta_grade <- na.omit(meta[,c(2,9)])
meta_grade$grade <- factor(meta_grade$grade,
levels = c("Normal","G1", "G2","G3","G4")) #自动把不要的数据变NA
meta_grade <- na.omit(meta_grade)
#如果想要GX数据就修改参数和后面代码
#meta_grade$grade <- factor(meta_grade$grade, levels = c("Normal","G1", "G2","G3","G4","GX"))
# 绘制小提琴图和显著性标记
library(ggplot2)
library(ggpubr)
png(paste0(gene,"_grade.png"),width = 1200,height = 1200,res = 300)
ggplot(data=meta_grade,aes(x=grade,y=g,colour = grade))+ #fill参数不要设置,会不好看
geom_violin(alpha = 0.8, #参数控制着小提琴图的透明度。
scale = 'width',#小提琴宽度
#linewidth = 1, #外轮廓粗细
trim = TRUE #参数控制着小提琴图的形状
) +
geom_boxplot(mapping=aes(x=grade,y=g,colour = grade,fill=grade), #箱线图
alpha = 0.5,size=1.5,width = 0.3)+
geom_jitter(mapping=aes(x=grade,y=g,colour = grade), #散点
alpha = 0.3,size=3)+
scale_fill_manual(limits=c("Normal","G1","G2","G3","G4"),
values =c("#312613","#BFA16F","#FB8C82","#B0C8CA","#F7EEE2"))+
scale_color_manual(limits=c("Normal","G1","G2","G3","G4"),
values=c("#312613","#BFA16F","#FB8C82","#B0C8CA","#F7EEE2"))+ #颜色
geom_signif(mapping=aes(x=grade,y=g), # 不同组别的显著性
comparisons = list(c("Normal","G1"),
c("Normal","G2"),
c("Normal","G3"),
c("Normal","G4"),
c("G1","G2"),
c("G1","G3"),
c("G1","G4"),
c("G2","G3"),
c("G2","G4"),
c("G3","G4")), # 哪些组进行比较
map_signif_level=T, # T显示显著性,F显示p value
tip_length=c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0),#把向下的帽子去掉,分组数乘以2
y_position = c(4.5,4.8,5.1,5.4,5.7,6,6.3,6.6,6.9,7.2), # 设置显著性线的位置高度
size=0.5, # 修改线的粗细
textsize = 2, # 修改显著性标记的大小
test = "wilcox.test", # 检验的类型,可以更改
color = "black",
)+ # 设置显著性线的颜色
theme_bw()+ #设置白色背景
guides(fill = guide_legend(title = "Group"), # 设置填充图例的标题
color = guide_legend(title = "Group"))+ # 设置颜色图例的标题
labs(title = "TCGA-KIRC", # 设置标题
x="",y= paste0("The mRNA expression of ",gene)) # 添加标题,x轴,y轴标签
dev.off()
3.5 stage
# 多分组数据可采用kruskal-wallis test
# 其中两样本采用非参数检验-wilcox.test
#存在NA数据因此需要把数据单独提取出来之后去除NA行
meta_stage <- na.omit(meta[,c(8,9)])
meta_stage$stage <- factor(meta_stage$stage, levels = c("Normal","I", "II","III","IV"))
meta_stage <- na.omit(meta_stage) #这一步可以不删除,分期数会出现X
# 绘制小提琴图和显著性标记
library(ggplot2)
library(ggpubr)
png(paste0(gene,"_stage.png"),width = 1200,height = 1200,res = 300)
ggplot(data=meta_stage,aes(x=stage,y=g,colour = stage))+ #fill参数不要设置,会不好看
geom_violin(alpha = 0.8, #参数控制着小提琴图的透明度。
scale = 'width',#小提琴宽度
#linewidth = 1, #外轮廓粗细
trim = TRUE ) + #参数控制着小提琴图的形状
geom_boxplot(mapping=aes(x=stage,y=g,colour = stage,fill=stage), #箱线图
alpha = 0.5,size=1.5,width = 0.3)+
geom_jitter(mapping=aes(x=stage,y=g,colour = stage),alpha = 0.3,size=3)+#散点
scale_fill_manual(limits=c("Normal","I","II","III","IV"),
values =c("#312613","#BFA16F","#FB8C82","#B0C8CA","#F7EEE2"))+
scale_color_manual(limits=c("Normal","I","II","III","IV"),
values=c("#312613","#BFA16F","#FB8C82","#B0C8CA","#F7EEE2"))+ #颜色
geom_signif(mapping=aes(x=stage,y=g), # 不同组别的显著性
comparisons = list(c("Normal","I"),
c("Normal","II"),
c("Normal","III"),
c("Normal","IV"),
c("I","II"),
c("I","III"),
c("I","IV"),
c("II","III"),
c("II","IV"),
c("III","IV")), # 哪些组进行比较
map_signif_level=T, # T显示显著性,F显示p value
tip_length=c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0),#把向下的帽子去掉,分组数乘以2
y_position = c(4.5,4.8,5.1,5.4,5.7,6,6.3,6.6,6.9,7.2), # 设置显著性线的位置高度
size=0.5, # 修改线的粗细
textsize = 2, # 修改显著性标记的大小
test = "wilcox.test", # 检验的类型,可以更改
color = "black",
)+ # 设置显著性线的颜色
theme_bw()+ #设置白色背景
guides(fill = guide_legend(title = "Group"), # 设置填充图例的标题
color = guide_legend(title = "Group"))+ # 设置颜色图例的标题
labs(title = "TCGA-KIRC", # 设置标题
x="",y= paste0("The mRNA expression of ",gene)) # 添加标题,x轴,y轴标签
dev.off()
3.6 age
# 其中两样本采用非参数检验-wilcox.test/或者符合正态分布就用t-test
#存在NA数据因此需要把数据单独提取出来之后去除NA行
meta_age <- na.omit(meta[,c(6,9)])
meta_age <- meta_age[Group == "tumor",]
meta_age$age <- factor(meta_age$age, levels = c("<=60",">60"))
meta_age <- na.omit(meta_age) #这一步可以不删除,也许出现X数据,因子化之后变NA去除
# 绘制小提琴图和显著性标记
library(ggplot2)
library(ggpubr)
png(paste0(gene,"_age.png"),width = 1200,height = 1200,res = 300)
ggplot(data=meta_age,aes(x=age,y=g,colour = age))+ #fill参数不要设置,会不好看
geom_violin(alpha = 0.8, #参数控制着小提琴图的透明度。
scale = 'width',#小提琴宽度
#linewidth = 1, #外轮廓粗细
trim = TRUE ) + #参数控制着小提琴图的形状
geom_boxplot(mapping=aes(x=age,y=g,colour = age,fill=age), #箱线图
alpha = 0.5,size=1.5,width = 0.3)+
geom_jitter(mapping=aes(x=age,y=g,colour = age), #散点
alpha = 0.3,size=3)+
scale_fill_manual(limits=c("<=60",">60"),
values =c("#312613","#BFA16F"))+
scale_color_manual(limits=c("<=60",">60"),
values=c("#312613","#BFA16F"))+ #颜色
geom_signif(mapping=aes(x=age,y=g), # 不同组别的显著性
comparisons = list(c("<=60",">60")), # 哪些组进行比较
map_signif_level=T, # T显示显著性,F显示p value
tip_length=c(0,0),#把向下的帽子去掉,分组数乘以2
y_position = c(4.5), # 设置显著性线的位置高度
size=0.5, # 修改线的粗细
textsize = 2, # 修改显著性标记的大小
test = "wilcox.test", # 检验的类型,可以更改
color = "black",
)+ # 设置显著性线的颜色
theme_bw()+ #设置白色背景
guides(fill = guide_legend(title = "Group"), # 设置填充图例的标题
color = guide_legend(title = "Group"))+ # 设置颜色图例的标题
labs(title = "TCGA-KIRC", # 设置标题
x="",y= paste0("The mRNA expression of ",gene)) # 添加标题,x轴,y轴标签
dev.off()
3.7 gender
# 其中两样本采用非参数检验-wilcox.test/或者符合正态分布就用t-test
#存在NA数据因此需要把数据单独提取出来之后去除NA行
meta_gender <- na.omit(meta[,c(7,9)])
meta_gender <- meta_gender[Group == "tumor",]
meta_gender$gender <- factor(meta_gender$gender, levels = c("male","female"))
meta_gender <- na.omit(meta_gender) #这一步可以不删除,也许出现X数据,因子化之后变NA去除
# 绘制小提琴图和显著性标记
library(ggplot2)
library(ggpubr)
png(paste0(gene,"_gender.png"),width = 1200,height = 1200,res = 300)
ggplot(data=meta_gender,aes(x=gender,y=g,colour = gender))+ #fill参数不要设置,会不好看
geom_violin(alpha = 0.8, #参数控制着小提琴图的透明度。
scale = 'width',#小提琴宽度
#linewidth = 1, #外轮廓粗细
trim = TRUE ) + #参数控制着小提琴图的形状
geom_boxplot(mapping=aes(x=gender,y=g,colour = gender,fill=gender), #箱线图
alpha = 0.5,size=1.5,width = 0.3)+
geom_jitter(mapping=aes(x=gender,y=g,colour = gender), alpha = 0.3,size=3)+#散点
scale_fill_manual(limits=c("male","female"),
values =c("#312613","#BFA16F"))+
scale_color_manual(limits=c("male","female"),
values=c("#312613","#BFA16F"))+ #颜色
geom_signif(mapping=aes(x=gender,y=g), # 不同组别的显著性
comparisons = list(c("male","female")), # 哪些组进行比较
map_signif_level=T, # T显示显著性,F显示p value
tip_length=c(0,0),#把向下的帽子去掉,分组数乘以2
y_position = c(4.5), # 设置显著性线的位置高度
size=0.5, # 修改线的粗细
textsize = 2, # 修改显著性标记的大小
test = "wilcox.test", # 检验的类型,可以更改
color = "black",
)+ # 设置显著性线的颜色
theme_bw()+ #设置白色背景
guides(fill = guide_legend(title = "Group"), # 设置填充图例的标题
color = guide_legend(title = "Group"))+ # 设置颜色图例的标题
labs(title = "TCGA-KIRC", # 设置标题
x="",y= paste0("The mRNA expression of ",gene)) # 添加标题,x轴,y轴标签
dev.off()
1.生存和临床数据处理
表达矩阵只需要tumor数据,不要normal,将其去掉,新表达矩阵数据命名为exprSet;临床信息需要进一步整理,成为生存分析需要的格式,新临床信息数据命名为meta。
#加载数据和R包
rm(list = ls())
proj = "TCGA-KIRC"
load(paste0(proj,"_clinical_exp.Rdata"))
library(stringr)
1.1去除normal样本-先从临床数据取再跟表达矩阵取交集比较好
table(Group)
#Group
#normal tumor
# 72 535
meta <- meta[Group == "tumor",]
table(str_sub(rownames(meta),14,15)) #确认数据是否都是tumor
# 01 05
#534 1
1.2 生存和临床数据合并
library(dplyr)
nrow(surv)
#[1] 979
nrow(meta)
#[1] 535
meta = left_join(meta,surv,by = c("ID"= "sample"))
nrow(meta)
#[1] 535
meta <- meta[,!colnames(meta) == "X_PATIENT"] #去除多余列
1.3 样本信息过滤
#去掉生存信息不全或者生存时间小于30天的样本,样本纳排标准不唯一,且差别很大.
#仁者见仁智者见智,是否去除自己看着办
k1 = meta$OS.time >= 30;table(k1)
#k1
#FALSE TRUE
# 13 518
#NA的数据是一定要去除的,生存分析不能有NA数据
k2 = !(is.na(meta$OS.time)|is.na(meta$OS));table(k2)
#k2
#FALSE TRUE
# 4 531
meta = meta[k1&k2,]
1.4 选列并简化列名
colnames(meta)=c('ID','grade','T_stage','M_stage','N_stage','age','gender','stage','event','time')
1.5 生存数据简化及规范
#(1)生存时间 认清生存时间的单位(通常是月,也可以用天和年);
range(meta$time)
#[1] 36 4537
meta$time = meta$time/30 #改成月
#meta$time = meta$time/12 #还可以进一步改成年
range(meta$time)
#[1] 1.2000 151.2333
1.6 实现表达矩阵与临床信息的匹配
rownames(meta) <- meta$ID
s = intersect(rownames(meta),colnames(exprSet));length(s)
#[1] 518
exprSet = exprSet[,s]
#确认顺序
identical(rownames(meta),colnames(exprSet))
#[1] TRUE
save(meta,exprSet,proj,file = paste0(proj,"_sur_analysis.Rdata"))
1.输入数据
rm(list = ls())
proj = "TCGA-KIRC"
load(paste0(proj,"_sur_analysis.Rdata"))
ls() #显示当前载入的内容是哪些
exprSet[1:4,1:4]
str(meta)
library(stringr)
library(survival)
library(survminer)
#输入基因名
gene = "KIFC1"
2.修改基因行名
rownames(exprSet)= str_replace_all(rownames(exprSet),"-","_")
#for循环生存分析
#特异写了循环函数,假设基因数量多了之后可以修改后使用
for (g in gene) {
meta[[g]] = ifelse((exprSet[rownames(exprSet) == g,]) <
median(exprSet[rownames(exprSet) == g,]),'low','high')
meta[[g]] = factor(meta[[g]],levels = c("low","high"))
sfit <- survfit(as.formula(paste("Surv(time, event) ~", g)), data=meta)
ggsurvplot(sfit,pval=TRUE)
png(filename = paste0(g,"_median.png"), width = 8, height = 6, units = "in", res = 300)
p <- ggsurvplot(
sfit,
censor.shape="|",
censor.size = 4,
conf.int = TRUE,
conf.int.style = "ribbon",
conf.int.alpha = 0.2,
pval = TRUE,
palette = "jco",
#surv.median.line = "hv",
ggtheme =theme_bw(),
legend = "top",
legend.labs = c("Low","High"),
xlab = "OS_time(Month)",
ylab = "Survival probability",
title = paste0(g," Influence"),#将标题根据gene分别命名
break.x.by = 24,#24个月界限
break.y.by = 0.1,
risk.table = TRUE,
risk.table.col = "strata",
risk.table.height = 0.2,
risk.table.y.text = FALSE
)
print(p)
dev.off() # 关闭设备,完成保存
}
4.生存分析-最佳截断值法
#for循环生存分析
#特异写了循环函数,假设基因数量多了之后可以修改后使用
for (g in gene) {
meta[[g]] = exprSet[rownames(exprSet) == g,] #不需要转置哦。自动转置
res.cut = survminer::surv_cutpoint(meta,
time = "time",
event = "event",
variables = g)
cut = res.cut[["cutpoint"]][1, 1]
meta[[paste0(g,"_group")]] = ifelse(meta[[g]] < cut,'low','high')
meta[[paste0(g,"_group")]] = factor(meta[[paste0(g,"_group")]],levels = c('low','high'))
sfit <- survfit(as.formula(paste("Surv(time, event) ~", paste0(g,"_group"))), data=meta)
ggsurvplot(sfit,pval=TRUE)
png(filename = paste0(g,"_best_cut.png"), width = 8, height = 6, units = "in", res = 300)
p1 <- ggsurvplot(
sfit,
censor.shape="|",
censor.size = 4,
conf.int = TRUE,
conf.int.style = "ribbon",
conf.int.alpha = 0.2,
pval = TRUE,
palette = "jco",
#surv.median.line = "hv",
ggtheme =theme_bw(),
legend = "top",
legend.labs = c("Low","High"),
xlab = "OS_time(Month)",
ylab = "Survival probability",
title = paste0(g," Influence"),#将标题根据gene分别命名
break.x.by = 24,#24个月界限
break.y.by = 0.1,
risk.table = TRUE,
risk.table.col = "strata",
risk.table.height = 0.2,
risk.table.y.text = FALSE
)
print(p1)
dev.off() # 关闭设备,完成保存
}
5.生存分析-四分位法
#for循环生存分析
#特异写了循环函数,假设基因数量多了之后可以修改后使用
for (g in gene) {
meta[[g]] = exprSet[rownames(exprSet) == g,] #不需要转置哦。自动转置
# 计算四分位数
quartiles <- quantile(meta[[g]], probs = c(0.25, 0.5, 0.75))
# 根据四分位数分组
meta[[paste0(g, "_group")]] <- cut(meta[[g]],
breaks = c(-Inf, quartiles, Inf),
labels = c("Q1", "Q2", "Q3", "Q4"))
# 确保分组变量是因子并设置因子水平顺序
meta[[paste0(g, "_group")]] <- factor(meta[[paste0(g, "_group")]], levels = c("Q1", "Q2", "Q3", "Q4"))
sfit <- survfit(as.formula(paste("Surv(time, event) ~", paste0(g,"_group"))), data=meta)
ggsurvplot(sfit,pval=TRUE)
png(filename = paste0(g,"_quantile.png"), width = 8, height = 8, units = "in", res = 300)
p1 <- ggsurvplot(
sfit,
censor.shape="|",
censor.size = 4,
conf.int = TRUE,
conf.int.style = "ribbon",
conf.int.alpha = 0.2,
pval = TRUE,
palette = "jco",
#surv.median.line = "hv",
ggtheme =theme_bw(),
legend = "top",
legend.labs = c("Q1", "Q2", "Q3", "Q4"),
xlab = "OS_time(Month)",
ylab = "Survival probability",
title = paste0(g," Influence"),#将标题根据gene分别命名
break.x.by = 24,#24个月界限
break.y.by = 0.1,
risk.table = TRUE,
risk.table.col = "strata",
risk.table.height = 0.2,
risk.table.y.text = FALSE
)
print(p1)
dev.off() # 关闭设备,完成保存
}
小结:这样的一套代码就可以基本解决单基因分析时常见的问题啦~。
致谢:感谢曾老师,小洁老师以及生信技能树团队全体成员(部分代码来源:生信技能树马拉松和数据挖掘课程)。
注:若对内容有疑惑或者有发现明确错误的朋友,请联系后台(希望多多交流)。
- END -如何能够像网页工具一样,仅输入基因名称,通过一套代码就可以获得基因表达量在不同样本中的表达量差异图,基因表达量差异与临床病理参数之间的关系图以及表达量差异与生存的关系图呢?
生信菜鸟团既往内容链接:
根据TMN以及stage和grade等临床信息查看TCGA的LIHC表达量矩阵的SLC25A11基因情况
这次曾老师就交代了这样的作业,希望能够只输入基因名就能把A-E的图给呈现出来。
PMID:37789080
rm(list=ls())
#打破下载时间的限制,改前60秒,改后1亿秒
options(timeout = 100000000)
options(scipen = 20)#不要以科学计数法表示
library(tidyverse)
library(data.table)
proj = "TCGA-KIRC"
#下载内容之前需要正确填写destfile的位置哦
if(T){
download.file(url = paste0("https://gdc.xenahubs.net/download/",proj, ".htseq_counts.tsv.gz"),destfile = paste0("~/Desktop/KIRC_analysis/",proj,".htseq_counts.tsv.gz")) ##表达数据
download.file(url = paste0("https://gdc.xenahubs.net/download/",proj, ".GDC_phenotype.tsv.gz"),destfile = paste0("~/Desktop/KIRC_analysis/",proj,".GDC_phenotype.tsv.gz")) ##临床数据
download.file(url = paste0("https://gdc.xenahubs.net/download/",proj, ".survival.tsv"),destfile = paste0("~/Desktop/KIRC_analysis/",proj,".survival.tsv")) ##生存数据
}
#读取数据
clinical = read.delim(paste0("~/Desktop/KIRC_analysis/",proj,".GDC_phenotype.tsv.gz"),fill = T,header = T,sep = "\t")
surv = read.delim(paste0("~/Desktop/KIRC_analysis/",proj,".survival.tsv"),header = T)
head(surv) #生存数据os和os.time
#1 TCGA-BP-4337-01A 1 TCGA-BP-4337 2
#2 TCGA-BP-4337-11A 1 TCGA-BP-4337 2
#3 TCGA-A3-A8CQ-01A 0 TCGA-A3-A8CQ 3
#4 TCGA-A3-A6NN-01A 0 TCGA-A3-A6NN 3
#5 TCGA-B4-5844-01A 0 TCGA-B4-5844 7
#6 TCGA-B4-5843-01A 0 TCGA-B4-5843 11
1.1 查看临床信息列名
tmp = data.frame(colnames(clinical))
#这个环节有助于提取自己想要的数据
1.2 stage
table(clinical$tumor_stage.diagnoses)
#not reported stage i stage ii stage iii stage iv
# 4 481 102 237 161
1.3 M分期
table(clinical$pathologic_M)
# M0 M1 MX
# 2 794 155 34
1.4 T分期
table(clinical$pathologic_T)
#T1 T1a T1b T2 T2a T2b T3 T3a T3b T3c T4
#37 250 205 101 15 8 7 234 102 4 22
1.5 N分期
table(clinical$pathologic_N)
# N0 N1 NX
#446 30 509
1.6 grade
table(clinical$neoplasm_histologic_grade)
# G1 G2 G3 G4 GX
# 5 21 415 385 153 6
2.处理表达矩阵和分组信息2.1 表达矩阵
#dat = read.table(paste0("input/",proj,".htseq_counts.tsv.gz"),check.names = F,row.names = 1,header = T)
dat <- data.table::fread(paste0("~/Desktop/KIRC_analysis/",proj,".htseq_counts.tsv.gz"),data.table = F) #全部是symbol
#head(dat)
#tail(dat)
a = dat[60484:60488,]
#dat的后五行是啥啊
#数据是log2(count+1),因此需要逆转并cpm处理
dat = dat[1:60483,]
rownames(dat) = dat$Ensembl_ID
a = dat
a = a[,-1]
##逆转 log
a = as.matrix(2^a - 1)
a[1:4,1:4]
# TCGA-BP-4342-01A TCGA-A3-3387-01A TCGA-BP-4167-01A TCGA-B8-4620-01A
#ENSG00000000003.13 2847 2261 2170 4586
#ENSG00000000005.5 2 17 10 3
#ENSG00000000419.11 1585 1642 979 1903
#ENSG00000000457.12 1372 895 457 505
# 用apply转换为整数矩阵
# 是因为后续edgeR分析时需要用到整数吗
exp = apply(a, 2, as.integer)
exp[1:4,1:4] #行名消失
# TCGA-BP-4342-01A TCGA-A3-3387-01A TCGA-BP-4167-01A TCGA-B8-4620-01A
# [1,] 2847 2260 2170 4586
# [2,] 2 16 10 3
# [3,] 1585 1641 979 1902
# [4,] 1371 894 456 505
rownames(exp) = rownames(dat)
exp[1:4,1:4]
# TCGA-BP-4342-01A TCGA-A3-3387-01A TCGA-BP-4167-01A TCGA-B8-4620-01A
#ENSG00000000003.13 2847 2260 2170 4586
#ENSG00000000005.5 2 16 10 3
#ENSG00000000419.11 1585 1641 979 1902
#ENSG00000000457.12 1371 894 456 505
#cpm处理
exp= log(edgeR::cpm(exp)+1)
2.2 表达矩阵行名ID转换:
library(stringr)
head(rownames(exp))
#[1] "ENSG00000000003.13" "ENSG00000000005.5" "ENSG00000000419.11"
#[4] "ENSG00000000457.12" "ENSG00000000460.15" "ENSG00000000938.11"
library(AnnoProbe)
?annoGene
#怎么匹配不上呢?是因为ENSEMBL有小数?下面这句高级一点哈哈哈
rownames(exp) = substr(rownames(exp), 1, 15)
##rownames(exp) = str_split(rownames(exp),"\\.",simplify = T)[,1];head(rownames(exp))
#annoGene(rownames(exp),ID_type = "ENSEMBL")
re = annoGene(rownames(exp),ID_type = "ENSEMBL");head(re)
# SYMBOL biotypes ENSEMBL chr start end
#1 DDX11L1 transcribed_unprocessed_pseudogene ENSG00000223972 chr1 11869 14409
#2 WASH7P unprocessed_pseudogene ENSG00000227232 chr1 14404 29570
#3 MIR6859-1 miRNA ENSG00000278267 chr1 17369 17436
#4 MIR1302-2HG lncRNA ENSG00000243485 chr1 29554 31109
#6 FAM138A lncRNA ENSG00000237613 chr1 34554 36081
#7 OR4G4P unprocessed_pseudogene ENSG00000268020 chr1 52473 53312
library(tinyarray)
#trans_array: transform rownames for microarray or rnaseq expression matrix
exp = trans_array(exp,ids = re,from = "ENSEMBL",to = "SYMBOL")
exp[1:4,1:4]
# TCGA-BP-4342-01A TCGA-A3-3387-01A TCGA-BP-4167-01A TCGA-B8-4620-01A
#DDX11L1 1 0 0 0
#WASH7P 77 69 23 29
#MIR6859-1 0 3 1 2
#MIR1302-2HG 2 0 0 0
2.4 保存数据
save(exp,proj,clinical,surv,file = paste0(proj,".Rdata"))
1.加载数据和R包
rm(list = ls())
proj = "TCGA-KIRC"
load(paste0(proj,".Rdata"))
library(tidyr)
library(tibble)
library(ggpubr)
library(dplyr)
library(stringr)
gene = "KIFC1"
1.1 分组信息处理
#根据样本ID的第14-15位,给样本分组(tumor和normal)
table(str_sub(colnames(exp),14,15))
# 01 05 11
#534 1 72
#这里的14-15位代表的样本的情况,其中01代表Primary Soild Tumor;05代表Additional -New Primary;11代表正常样本
Group = ifelse(as.numeric(str_sub(colnames(exp),14,15)) < 10,'tumor','normal')
Group = factor(Group,levels = c("normal","tumor"))
table(Group)
# Group
# normal tumor
# 72 535
2.准备数据
#提取想要的基因数据
exp_g <- exp[gene,] #这里需要输入想要分析的基因
exp_g <- as.data.frame(t(exp_g))
#准备数据
dat = t(exp_g) %>%
as.data.frame() %>% #上两步是转置信息并表格化
rownames_to_column() %>% #把行名转成一列数据
mutate(Group)
dim(dat)
# [1] 603 3
pdat = dat%>%
pivot_longer(cols =(!rowname)&(!Group), #除了 rowname 和 Group 之外的所有列
names_to = "gene", #指定新的列名,用于存储原来列的名称
values_to = "count") #指定新的列名,用于存储原来列中的值
3.正态性检验-对每一组数据进行正态性检验
#分别提取normal和tumor数据
normality_N <- pdat[pdat$Group == "normal",]
normality_T <- pdat[pdat$Group == "tumor",]
#检验数据是否满足正态分布的常见方法有:
# Shapiro-Wilk检验,Kolmogorov-Smirnov检验,Anderson-Darling检验,Lilliefors检验,Jarque-Bera检验,D'Agostino's K-squared检验
#最常用以下两种
#Shapiro-Wilk 正态性检验:最常用的正态性检验方法之一。适用于小样本数据(通常认为小于30)。
#shapiro.test(normality_N$count)
#shapiro.test(normality_T$count)
#Kolmogorov-Smirnov (K-S)正态性检验:这是一种基于经验分布函数的检验方法,适用于大样本数据。
ks.test(normality_N$count, "pnorm", mean(normality_N$count), sd(normality_N$count))
# Exact one-sample Kolmogorov-Smirnov test
# data: normality_N$count
# D = 0.16794, p-value = 0.03038
# alternative hypothesis: two-sided
ks.test(normality_T$count, "pnorm", mean(normality_T$count), sd(normality_T$count))
# Asymptotic one-sample Kolmogorov-Smirnov test
# data: normality_T$count
# D = 0.059156, p-value = 0.0473
# alternative hypothesis: two-sided
#看p值,若满足p-value>0.05,则数据满足正态分布,若不满足则用非参数检验。
#同时我们注意到,即使是KS法对样本进行分析时,也还是会按照大样本的中样本量的多与少采取不同的分析方法“Exact”/“Asymptotic”。
#直方图
hist(normality_N$count, main="Histogram for frequency") #默认是probability = F, 频数直方图
hist(normality_N$count, main="Histogram for density", probability = T) #probability = T, 密度直方图,每个区间的频率除以该区间的宽度和总的数据点数量,从而得到密度
hist(normality_T$count, main="Histogram for frequency")
hist(normality_T$count, main="Histogram for density", probability = T)
#QQ图(Quantile-Quantile Plot):这是一种图形方法,可以直观地检查数据是否接近正态分布。
#在QQ图中,如果数据点紧密地遵循参考线(45度线),则数据接近正态分布
qqnorm(normality_N$count)
qqline(normality_N$count)
qqnorm(normality_T$count)
qqline(normality_T$count)
4.方差齐性检验
#方差性(方差齐性)的检验,常用的方法是:
# Bartlett's Test 适用于正态分布数据
bartlett.test(pdat$count ~ Group, data = pdat)
# Levene's Test 适用于不确定数据是否符合正态分布的情况
library(car)
leveneTest(pdat$count ~ Group, data = pdat)
#P值小于0.05时,方差不齐
#Levene's Test for Homogeneity of Variance (center = median)
# Df F value Pr(>F)
#group 1 9.5624 0.002077 **
# 605
#---
#Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# 该数据需采用两样本的非参数检验-wilcox.test
# 绘制小提琴图和显著性标记
library(ggplot2)
library(ggpubr)
png("p.png",width = 1200,height = 1200,res = 300)
ggplot(data=pdat,aes(x=pdat$Group,y=pdat$count,
colour = pdat$Group))+ #fill参数不要设置,会不好看
geom_violin(#color = 'grey',
alpha = 0.8, #alpha = 0.8 参数控制着小提琴图的透明度。
scale = 'width',#小提琴宽度
#linewidth = 1, #外轮廓粗细
trim = TRUE)+ # trim = TRUE 参数控制着小提琴图的形状。
geom_boxplot(mapping=aes(x=pdat$Group,y=pdat$count,
colour=pdat$Group,fill=pdat$Group), #箱线图
alpha = 0.5,
size=1.5,
width = 0.3)+
geom_jitter(mapping=aes(x=pdat$Group,y=pdat$count,colour=pdat$Group), #散点
alpha = 0.3,size=3)+
scale_fill_manual(limits=c("normal","tumor"),
values =c("#9E9E9E","#F5C96B"))+
scale_color_manual(limits=c("normal","tumor"),
values=c("#9E9E9E","#F5C96B"))+ #颜色
geom_signif(mapping=aes(x=pdat$Group,y=pdat$count), # 不同组别的显著性
comparisons = list(c("normal","tumor")), # 哪些组进行比较
map_signif_level=T, # T显示显著性,F显示p value
tip_length=c(0,0),#把向下的帽子去掉,分组数乘以2
y_position = c(4.5), # 设置显著性线的位置高度
size=1, # 修改线的粗细
textsize = 4, # 修改显著性标记的大小
test = "wilcox.test", # 检验的类型,可以更改
color = "black")+ # 设置显著性线的颜色
theme_bw()+ #设置白色背景
guides(fill = guide_legend(title = "Group"), # 设置填充图例的标题
color = guide_legend(title = "Group"))+ # 设置颜色图例的标题
labs(title = "TCGA-KIRC", # 设置标题
x="",y= paste0("The mRNA expression of ",gene)) # 添加标题,x轴,y轴标签
dev.off()
#查看典型管家基因(如:GAPDH、ACTB)的表达量,如果表达量高于正常值,说明我们数据没问题。
exp['GAPDH',]
exp['ACTB',]
#随机取样基因,对比管家基因的表达
#我们可以看到我们数据中两个管家基因的表达量都偏高,符合预期。
cg = exp[sample(nrow(exp), 3), ]
boxplot(cg)
##所以数据也没啥问题
1.数据处理
表达矩阵需要tumor和normal,新表达矩阵数据命名为exprSet;临床信息需要进一步整理,成为生存分析需要的格式,新临床信息数据命名为meta。
#加载数据和R包
rm(list=ls())
proj = "TCGA-KIRC"
load(paste0(proj,".Rdata"))
library(stringr)
library(dplyr)
1.1 数据准备
#临床信息和表达矩阵取交集
rownames(clinical) <- clinical$submitter_id.samples
meta <- clinical
exprSet <- exp
p <- identical(rownames(meta),colnames(exprSet));p
if(!p) {
s = intersect(rownames(meta),colnames(exprSet))
exprSet = exprSet[,s]
meta = meta[s,]
}
1.2 选列并简化列名
tmp <- as.data.frame(colnames(meta))
#如果有其他感兴趣的指标可以多提取一些
meta = meta[,c(
'submitter_id.samples',
'neoplasm_histologic_grade',
'pathologic_T' ,
'pathologic_M',
'pathologic_N',
'age_at_index.demographic',
'gender.demographic' ,
'tumor_stage.diagnoses')]
colnames(meta)=c('ID','grade','T_stage','M_stage','N_stage','age','gender','stage')
1.3 简化、规范内容
#检查各列的信息是否规范 没有冗余信息 缺失的信息用NA代替
#——Stage——————————————————————————————————
table(meta$stage)
#not reported stage i stage ii stage iii stage iv
# 3 294 69 139 102
meta$stage = meta$stage %>%
str_replace("not reported",as.character(NA)) %>%
str_remove("stage ") %>%
str_to_upper() # 罗马数字1、2、3、4小写变大写
stage = meta$stage
#下面这段内容可以在合适的情况下启用
#stage = case_when(stage == 'I' ~ 'I',
# stage == 'II' ~ 'II',
# stage == 'III' ~ 'III',
# stage == 'IIIA' ~ 'III',
# stage == 'IIIB' ~ 'III',
# stage == 'IIIC' ~ 'III',
# stage == 'IV' ~ 'IV',
# stage == 'IVB' ~ 'IV',
# TRUE ~ as.character(T))
#table(stage) #如果上述这段启用了,后续需要赋值
#meta$stage = stage #如果上述这段启用了,后续需要赋值
#把缺失的值和不能确定的值都变为空值
#下面的代码是指把stage中不包含I的内容全部转换成字符型的NA数据
meta$stage[!str_detect(meta$stage,"I")]=as.character(NA) #na替换空
table(meta$stage,useNA = "always") #always的含义是总是计算NA
# I II III IV <NA>
# 294 69 139 102 3
#——T_stage——————————————————————————————————
T_stage = meta$T_stage
T_stage = case_when(T_stage == 'T1' ~ 'T1',
T_stage == 'T1a' ~ 'T1',
T_stage == 'T1b' ~ 'T1',
T_stage == 'T2' ~ 'T2',
T_stage == 'T2a' ~ 'T2',
T_stage == 'T2b' ~ 'T2',
T_stage == 'T3' ~ 'T3',
T_stage == 'T3a' ~ 'T3',
T_stage == 'T3b' ~ 'T3',
T_stage == 'T3c' ~ 'T3',
T_stage == 'T4' ~ 'T4',
TRUE ~ as.character(T)) #对于所有其他情况,保持原来的值不变。
table(T_stage)
#T_stage
# T1 T2 T3 T4
#302 84 208 13
meta$T_stage = T_stage
meta$T_stage[!str_detect(meta$T_stage,"T")]=as.character(NA) #na替换空
#meta$T_stage[str_detect(meta$T_stage,"TX")]=as.character(NA) #na替换空
#table(meta$T_stage)
#——grade————————————————————————————————————
table(meta$grade,useNA = "always")
# G1 G2 G3 G4 GX <NA>
# 3 15 259 235 90 5 0
meta$grade[!str_detect(meta$grade,"G")]=as.character(NA) #na替换空
table(meta$grade,useNA = "always")
# G1 G2 G3 G4 GX <NA>
# 15 259 235 90 5 3
#——M_stage——————————————————————————
table(meta$M_stage,useNA = "always")
# M0 M1 MX <NA>
# 2 477 97 31 0
meta$M_stage[!str_detect(meta$M_stage,"M")]=as.character(NA) #na替换空
meta$M_stage[str_detect(meta$M_stage,"MX")]=as.character(NA) #MX替换空
table(meta$M_stage,useNA = "always")
# M0 M1 <NA>
# 2 477 97 31
#——N_stage————————————————
meta$N_stage[!str_detect(meta$N_stage,"N")]=as.character(NA) #na替换空
meta$N_stage[str_detect(meta$N_stage,"NX")]=as.character(NA) #NX替换空
table(meta$N_stage,useNA = "always")
# N0 N1 <NA>
# 277 18 307
#——gender——————————————————————
table(meta$gender,useNA = "always")
#——age————————————————————
table(meta$age,useNA = "always")
meta$age = ifelse(as.numeric(str_sub(meta$age)) < 60,'<=60','>60')
table(meta$age,useNA = "always")
#<=60 >60 <NA>
# 276 331 0
1.4 实现表达矩阵与临床信息的匹配
#因为临床信息那一环节去除了没有生存的数据,所以样本量有了变化
#在保存之前重新匹配一下
rownames(meta) <- meta$ID
s = intersect(rownames(meta),colnames(exprSet));length(s)
#[1] 607
exprSet = exprSet[,s]
identical(rownames(meta),colnames(exprSet))
#> [1] TRUE
1.5 分组信息处理
#这次分组完只要后面顺序不变样本不删除就没啥问题
#这里的分组是按照exprSet的列名去分的哦! exprSet和meta的顺序是一致的
#根据样本ID的第14-15位,给样本分组(tumor和normal)
table(str_sub(colnames(exprSet),14,15))
#这里的14-15位代表的样本的情况,其中01代表Primary Soild Tumor;05代表Additional -New Primary;11代表正常样本
Group = ifelse(as.numeric(str_sub(colnames(exprSet),14,15)) < 10,'tumor','normal')
Group = factor(Group,levels = c("normal","tumor"))
table(Group)
# Group
# normal tumor
# 72 535
save(meta,exprSet,Group,surv,proj,file = paste0(proj,"_clinical_exp.Rdata"))
1.加载数据和R包
rm(list = ls())
proj = "TCGA-KIRC"
load(paste0(proj,"_clinical_exp.Rdata"))
library(tidyr)
library(tibble)
library(ggpubr)
library(dplyr)
#该部分内容除了修改gene信息以外,还需要注意修改临床参数。因为不同数据中clinical参数会有不同
gene = "KIFC1"
2.准备数据
#提取想要的基因数据
g <- exprSet[gene,]
meta <- cbind(meta,g)
#在把Group合并上去
meta <- cbind(meta,Group)
#把临床信息分组情况做好标注
meta$grade <- ifelse(meta$Group == "normal","Normal",meta$grade)
meta$stage <- ifelse(meta$Group == "normal","Normal",meta$stage)
meta$T_stage <- ifelse(meta$Group == "normal","Normal",meta$T_stage)
meta$N_stage <- ifelse(meta$Group == "normal","Normal",meta$N_stage)
meta$M_stage <- ifelse(meta$Group == "normal","Normal",meta$M_stage)
3.临床参数绘图
3.1 T_stage
# 多分组数据可采用kruskal-wallis test
# 其中两样本采用非参数检验-wilcox.test
#存在NA数据因此需要把数据单独提取出来之后去除NA行
meta_T <- na.omit(meta[,c(3,9)])
meta_T$T_stage <- factor(meta_T$T_stage, levels = c("Normal","T1","T2","T3","T4"))
# 绘制小提琴图和显著性标记
library(ggplot2)
library(ggpubr)
png(paste0(gene,"_T_stage.png"),width = 1200,height = 1200,res = 300)
ggplot(data=meta_T,aes(x=T_stage,y=g,colour = T_stage))+ #fill参数不要设置,会不好看
geom_violin(#color = 'grey',
alpha = 0.8, #alpha = 0.8 参数控制着小提琴图的透明度。
scale = 'width',#小提琴宽度
#linewidth = 1, #外轮廓粗细
trim = TRUE)+ # trim = TRUE 参数控制着小提琴图的形状。
geom_boxplot(mapping=aes(x=T_stage,y=g,colour = T_stage,fill=T_stage), #箱线图
alpha = 0.5,size=1.5,width = 0.3)+
geom_jitter(mapping=aes(x=T_stage,y=g,colour = T_stage), #散点
alpha = 0.3,size=3)+
scale_fill_manual(limits=c("Normal","T1","T2","T3","T4"),
values =c("#312613","#BFA16F","#FB8C82","#B0C8CA","#F7EEE2"))+
scale_color_manual(limits=c("Normal","T1","T2","T3","T4"),
values=c("#312613","#BFA16F","#FB8C82","#B0C8CA","#F7EEE2"))+ #颜色
geom_signif(mapping=aes(x=T_stage,y=g), # 不同组别的显著性
comparisons = list(c("Normal","T1"),
c("Normal","T2"),
c("Normal","T3"),
c("Normal","T4"),
c("T1","T2"),
c("T1","T3"),
c("T1","T4"),
c("T2","T3"),
c("T2","T4"),
c("T3","T4")), # 哪些组进行比较
map_signif_level=T, # T显示显著性,F显示p value
tip_length=c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0),#把向下的帽子去掉,分组数乘以2
y_position = c(4.5,4.8,5.1,5.4,5.7,6,6.3,6.6,6.9,7.2), # 设置显著性线的位置高度
size=0.5, # 修改线的粗细
textsize = 2, # 修改显著性标记的大小
test = "wilcox.test", # 检验的类型,可以更改
color = "black")+ # 设置显著性线的颜色
theme_bw()+ #设置白色背景
guides(fill = guide_legend(title = "Group"), # 设置填充图例的标题
color = guide_legend(title = "Group"))+ # 设置颜色图例的标题
labs(title = "TCGA-KIRC", # 设置标题
x="",y= paste0("The mRNA expression of ",gene)) # 添加标题,x轴,y轴标签
dev.off()
3.2 N_stage
# 多分组数据可采用kruskal-wallis test
# 其中两样本采用非参数检验-wilcox.test
#存在NA数据因此需要把数据单独提取出来之后去除NA行
meta_N <- na.omit(meta[,c(5,9)])
meta_N$N_stage <- factor(meta_N$N_stage, levels = c("Normal", "N0", "N1"))
# 绘制小提琴图和显著性标记
library(ggplot2)
library(ggpubr)
png(paste0(gene,"_N_stage.png"),width = 1200,height = 1200,res = 300)
ggplot(data=meta_N,aes(x=N_stage,y=g,colour = N_stage))+ #fill参数不要设置,会不好看
geom_violin(#color = 'grey',
alpha = 0.8, #参数控制着小提琴图的透明度。
scale = 'width',#小提琴宽度
#linewidth = 1, #外轮廓粗细
trim = TRUE #参数控制着小提琴图的形状
) +
geom_boxplot(mapping=aes(x=N_stage,y=g,colour = N_stage,fill=N_stage), #箱线图
alpha = 0.5,size=1.5,width = 0.3)+
geom_jitter(mapping=aes(x=N_stage,y=g,colour = N_stage),
alpha = 0.3,size=3)+#散点
scale_fill_manual(limits=c("Normal","N0","N1"),
values =c("#312613","#BFA16F","#FB8C82"))+
scale_color_manual(limits=c("Normal","N0","N1"),
values=c("#312613","#BFA16F","#FB8C82"))+ #颜色
geom_signif(mapping=aes(x=N_stage,y=g), # 不同组别的显著性
comparisons = list(c("Normal","N0"),
c("Normal","N1"),
c("N0","N1")), # 哪些组进行比较
map_signif_level=T, # T显示显著性,F显示p value
tip_length=c(0,0,0,0,0,0),#把向下的帽子去掉,分组数乘以2
y_position = c(4.5,4.8,5.1), # 设置显著性线的位置高度
size=0.5, # 修改线的粗细
textsize = 2, # 修改显著性标记的大小
test = "wilcox.test", # 检验的类型,可以更改
color = "black",
)+ # 设置显著性线的颜色
theme_bw()+ #设置白色背景
guides(fill = guide_legend(title = "Group"), # 设置填充图例的标题
color = guide_legend(title = "Group"))+ # 设置颜色图例的标题
labs(title = "TCGA-KIRC", # 设置标题
x="",y= paste0("The mRNA expression of ",gene)) # 添加标题,x轴,y轴标签
dev.off()
3.3 M_stage
# 多分组数据可采用kruskal-wallis test
# 其中两样本采用非参数检验-wilcox.test
#存在NA数据因此需要把数据单独提取出来之后去除NA行
meta_M <- na.omit(meta[,c(4,9)])
meta_M$M_stage <- factor(meta_M$M_stage, levels = c("Normal", "M0", "M1"))
# 绘制小提琴图和显著性标记
library(ggplot2)
library(ggpubr)
png(paste0(gene,"_M_stage.png"),width = 1200,height = 1200,res = 300)
ggplot(data=meta_M,aes(x=M_stage,y=g,colour = M_stage))+ #fill参数不要设置,会不好看
geom_violin(alpha = 0.8, #参数控制着小提琴图的透明度。
scale = 'width',#小提琴宽度
#linewidth = 1, #外轮廓粗细
trim = TRUE ) + #参数控制着小提琴图的形状
geom_boxplot(mapping=aes(x=M_stage,y=g,
colour = M_stage,fill=M_stage), #箱线图
alpha = 0.5,size=1.5,width = 0.3)+
geom_jitter(mapping=aes(x=M_stage,y=g,colour = M_stage), #散点
alpha = 0.3,size=3)+
scale_fill_manual(limits=c("Normal","M0","M1"),
values =c("#312613","#BFA16F","#FB8C82"))+
scale_color_manual(limits=c("Normal","M0","M1"),
values=c("#312613","#BFA16F","#FB8C82"))+ #颜色
geom_signif(mapping=aes(x=M_stage,y=g), # 不同组别的显著性
comparisons = list(c("Normal","M0"),
c("Normal","M1"),
c("M0","M1")), # 哪些组进行比较
map_signif_level=T, # T显示显著性,F显示p value
tip_length=c(0,0,0,0,0,0),#把向下的帽子去掉,分组数乘以2
y_position = c(4.5,4.8,5.1), # 设置显著性线的位置高度
size=0.5, # 修改线的粗细
textsize = 2, # 修改显著性标记的大小
test = "wilcox.test", # 检验的类型,可以更改
color = "black",
)+ # 设置显著性线的颜色
theme_bw()+ #设置白色背景
guides(fill = guide_legend(title = "Group"), # 设置填充图例的标题
color = guide_legend(title = "Group"))+ # 设置颜色图例的标题
labs(title = "TCGA-KIRC", # 设置标题
x="",y= paste0("The mRNA expression of ",gene)) # 添加标题,x轴,y轴标签
dev.off()
3.4 Grade_stage
# 多分组数据可采用kruskal-wallis test
# 其中两样本采用非参数检验-wilcox.test
#存在NA数据因此需要把数据单独提取出来之后去除NA行
meta_grade <- na.omit(meta[,c(2,9)])
meta_grade$grade <- factor(meta_grade$grade,
levels = c("Normal","G1", "G2","G3","G4")) #自动把不要的数据变NA
meta_grade <- na.omit(meta_grade)
#如果想要GX数据就修改参数和后面代码
#meta_grade$grade <- factor(meta_grade$grade, levels = c("Normal","G1", "G2","G3","G4","GX"))
# 绘制小提琴图和显著性标记
library(ggplot2)
library(ggpubr)
png(paste0(gene,"_grade.png"),width = 1200,height = 1200,res = 300)
ggplot(data=meta_grade,aes(x=grade,y=g,colour = grade))+ #fill参数不要设置,会不好看
geom_violin(alpha = 0.8, #参数控制着小提琴图的透明度。
scale = 'width',#小提琴宽度
#linewidth = 1, #外轮廓粗细
trim = TRUE #参数控制着小提琴图的形状
) +
geom_boxplot(mapping=aes(x=grade,y=g,colour = grade,fill=grade), #箱线图
alpha = 0.5,size=1.5,width = 0.3)+
geom_jitter(mapping=aes(x=grade,y=g,colour = grade), #散点
alpha = 0.3,size=3)+
scale_fill_manual(limits=c("Normal","G1","G2","G3","G4"),
values =c("#312613","#BFA16F","#FB8C82","#B0C8CA","#F7EEE2"))+
scale_color_manual(limits=c("Normal","G1","G2","G3","G4"),
values=c("#312613","#BFA16F","#FB8C82","#B0C8CA","#F7EEE2"))+ #颜色
geom_signif(mapping=aes(x=grade,y=g), # 不同组别的显著性
comparisons = list(c("Normal","G1"),
c("Normal","G2"),
c("Normal","G3"),
c("Normal","G4"),
c("G1","G2"),
c("G1","G3"),
c("G1","G4"),
c("G2","G3"),
c("G2","G4"),
c("G3","G4")), # 哪些组进行比较
map_signif_level=T, # T显示显著性,F显示p value
tip_length=c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0),#把向下的帽子去掉,分组数乘以2
y_position = c(4.5,4.8,5.1,5.4,5.7,6,6.3,6.6,6.9,7.2), # 设置显著性线的位置高度
size=0.5, # 修改线的粗细
textsize = 2, # 修改显著性标记的大小
test = "wilcox.test", # 检验的类型,可以更改
color = "black",
)+ # 设置显著性线的颜色
theme_bw()+ #设置白色背景
guides(fill = guide_legend(title = "Group"), # 设置填充图例的标题
color = guide_legend(title = "Group"))+ # 设置颜色图例的标题
labs(title = "TCGA-KIRC", # 设置标题
x="",y= paste0("The mRNA expression of ",gene)) # 添加标题,x轴,y轴标签
dev.off()
3.5 stage
# 多分组数据可采用kruskal-wallis test
# 其中两样本采用非参数检验-wilcox.test
#存在NA数据因此需要把数据单独提取出来之后去除NA行
meta_stage <- na.omit(meta[,c(8,9)])
meta_stage$stage <- factor(meta_stage$stage, levels = c("Normal","I", "II","III","IV"))
meta_stage <- na.omit(meta_stage) #这一步可以不删除,分期数会出现X
# 绘制小提琴图和显著性标记
library(ggplot2)
library(ggpubr)
png(paste0(gene,"_stage.png"),width = 1200,height = 1200,res = 300)
ggplot(data=meta_stage,aes(x=stage,y=g,colour = stage))+ #fill参数不要设置,会不好看
geom_violin(alpha = 0.8, #参数控制着小提琴图的透明度。
scale = 'width',#小提琴宽度
#linewidth = 1, #外轮廓粗细
trim = TRUE ) + #参数控制着小提琴图的形状
geom_boxplot(mapping=aes(x=stage,y=g,colour = stage,fill=stage), #箱线图
alpha = 0.5,size=1.5,width = 0.3)+
geom_jitter(mapping=aes(x=stage,y=g,colour = stage),alpha = 0.3,size=3)+#散点
scale_fill_manual(limits=c("Normal","I","II","III","IV"),
values =c("#312613","#BFA16F","#FB8C82","#B0C8CA","#F7EEE2"))+
scale_color_manual(limits=c("Normal","I","II","III","IV"),
values=c("#312613","#BFA16F","#FB8C82","#B0C8CA","#F7EEE2"))+ #颜色
geom_signif(mapping=aes(x=stage,y=g), # 不同组别的显著性
comparisons = list(c("Normal","I"),
c("Normal","II"),
c("Normal","III"),
c("Normal","IV"),
c("I","II"),
c("I","III"),
c("I","IV"),
c("II","III"),
c("II","IV"),
c("III","IV")), # 哪些组进行比较
map_signif_level=T, # T显示显著性,F显示p value
tip_length=c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0),#把向下的帽子去掉,分组数乘以2
y_position = c(4.5,4.8,5.1,5.4,5.7,6,6.3,6.6,6.9,7.2), # 设置显著性线的位置高度
size=0.5, # 修改线的粗细
textsize = 2, # 修改显著性标记的大小
test = "wilcox.test", # 检验的类型,可以更改
color = "black",
)+ # 设置显著性线的颜色
theme_bw()+ #设置白色背景
guides(fill = guide_legend(title = "Group"), # 设置填充图例的标题
color = guide_legend(title = "Group"))+ # 设置颜色图例的标题
labs(title = "TCGA-KIRC", # 设置标题
x="",y= paste0("The mRNA expression of ",gene)) # 添加标题,x轴,y轴标签
dev.off()
3.6 age
# 其中两样本采用非参数检验-wilcox.test/或者符合正态分布就用t-test
#存在NA数据因此需要把数据单独提取出来之后去除NA行
meta_age <- na.omit(meta[,c(6,9)])
meta_age <- meta_age[Group == "tumor",]
meta_age$age <- factor(meta_age$age, levels = c("<=60",">60"))
meta_age <- na.omit(meta_age) #这一步可以不删除,也许出现X数据,因子化之后变NA去除
# 绘制小提琴图和显著性标记
library(ggplot2)
library(ggpubr)
png(paste0(gene,"_age.png"),width = 1200,height = 1200,res = 300)
ggplot(data=meta_age,aes(x=age,y=g,colour = age))+ #fill参数不要设置,会不好看
geom_violin(alpha = 0.8, #参数控制着小提琴图的透明度。
scale = 'width',#小提琴宽度
#linewidth = 1, #外轮廓粗细
trim = TRUE ) + #参数控制着小提琴图的形状
geom_boxplot(mapping=aes(x=age,y=g,colour = age,fill=age), #箱线图
alpha = 0.5,size=1.5,width = 0.3)+
geom_jitter(mapping=aes(x=age,y=g,colour = age), #散点
alpha = 0.3,size=3)+
scale_fill_manual(limits=c("<=60",">60"),
values =c("#312613","#BFA16F"))+
scale_color_manual(limits=c("<=60",">60"),
values=c("#312613","#BFA16F"))+ #颜色
geom_signif(mapping=aes(x=age,y=g), # 不同组别的显著性
comparisons = list(c("<=60",">60")), # 哪些组进行比较
map_signif_level=T, # T显示显著性,F显示p value
tip_length=c(0,0),#把向下的帽子去掉,分组数乘以2
y_position = c(4.5), # 设置显著性线的位置高度
size=0.5, # 修改线的粗细
textsize = 2, # 修改显著性标记的大小
test = "wilcox.test", # 检验的类型,可以更改
color = "black",
)+ # 设置显著性线的颜色
theme_bw()+ #设置白色背景
guides(fill = guide_legend(title = "Group"), # 设置填充图例的标题
color = guide_legend(title = "Group"))+ # 设置颜色图例的标题
labs(title = "TCGA-KIRC", # 设置标题
x="",y= paste0("The mRNA expression of ",gene)) # 添加标题,x轴,y轴标签
dev.off()
3.7 gender
# 其中两样本采用非参数检验-wilcox.test/或者符合正态分布就用t-test
#存在NA数据因此需要把数据单独提取出来之后去除NA行
meta_gender <- na.omit(meta[,c(7,9)])
meta_gender <- meta_gender[Group == "tumor",]
meta_gender$gender <- factor(meta_gender$gender, levels = c("male","female"))
meta_gender <- na.omit(meta_gender) #这一步可以不删除,也许出现X数据,因子化之后变NA去除
# 绘制小提琴图和显著性标记
library(ggplot2)
library(ggpubr)
png(paste0(gene,"_gender.png"),width = 1200,height = 1200,res = 300)
ggplot(data=meta_gender,aes(x=gender,y=g,colour = gender))+ #fill参数不要设置,会不好看
geom_violin(alpha = 0.8, #参数控制着小提琴图的透明度。
scale = 'width',#小提琴宽度
#linewidth = 1, #外轮廓粗细
trim = TRUE ) + #参数控制着小提琴图的形状
geom_boxplot(mapping=aes(x=gender,y=g,colour = gender,fill=gender), #箱线图
alpha = 0.5,size=1.5,width = 0.3)+
geom_jitter(mapping=aes(x=gender,y=g,colour = gender), alpha = 0.3,size=3)+#散点
scale_fill_manual(limits=c("male","female"),
values =c("#312613","#BFA16F"))+
scale_color_manual(limits=c("male","female"),
values=c("#312613","#BFA16F"))+ #颜色
geom_signif(mapping=aes(x=gender,y=g), # 不同组别的显著性
comparisons = list(c("male","female")), # 哪些组进行比较
map_signif_level=T, # T显示显著性,F显示p value
tip_length=c(0,0),#把向下的帽子去掉,分组数乘以2
y_position = c(4.5), # 设置显著性线的位置高度
size=0.5, # 修改线的粗细
textsize = 2, # 修改显著性标记的大小
test = "wilcox.test", # 检验的类型,可以更改
color = "black",
)+ # 设置显著性线的颜色
theme_bw()+ #设置白色背景
guides(fill = guide_legend(title = "Group"), # 设置填充图例的标题
color = guide_legend(title = "Group"))+ # 设置颜色图例的标题
labs(title = "TCGA-KIRC", # 设置标题
x="",y= paste0("The mRNA expression of ",gene)) # 添加标题,x轴,y轴标签
dev.off()
1.生存和临床数据处理
表达矩阵只需要tumor数据,不要normal,将其去掉,新表达矩阵数据命名为exprSet;临床信息需要进一步整理,成为生存分析需要的格式,新临床信息数据命名为meta。
#加载数据和R包
rm(list = ls())
proj = "TCGA-KIRC"
load(paste0(proj,"_clinical_exp.Rdata"))
library(stringr)
1.1去除normal样本-先从临床数据取再跟表达矩阵取交集比较好
table(Group)
#Group
#normal tumor
# 72 535
meta <- meta[Group == "tumor",]
table(str_sub(rownames(meta),14,15)) #确认数据是否都是tumor
# 01 05
#534 1
1.2 生存和临床数据合并
library(dplyr)
nrow(surv)
#[1] 979
nrow(meta)
#[1] 535
meta = left_join(meta,surv,by = c("ID"= "sample"))
nrow(meta)
#[1] 535
meta <- meta[,!colnames(meta) == "X_PATIENT"] #去除多余列
1.3 样本信息过滤
#去掉生存信息不全或者生存时间小于30天的样本,样本纳排标准不唯一,且差别很大.
#仁者见仁智者见智,是否去除自己看着办
k1 = meta$OS.time >= 30;table(k1)
#k1
#FALSE TRUE
# 13 518
#NA的数据是一定要去除的,生存分析不能有NA数据
k2 = !(is.na(meta$OS.time)|is.na(meta$OS));table(k2)
#k2
#FALSE TRUE
# 4 531
meta = meta[k1&k2,]
1.4 选列并简化列名
colnames(meta)=c('ID','grade','T_stage','M_stage','N_stage','age','gender','stage','event','time')
1.5 生存数据简化及规范
#(1)生存时间 认清生存时间的单位(通常是月,也可以用天和年);
range(meta$time)
#[1] 36 4537
meta$time = meta$time/30 #改成月
#meta$time = meta$time/12 #还可以进一步改成年
range(meta$time)
#[1] 1.2000 151.2333
1.6 实现表达矩阵与临床信息的匹配
rownames(meta) <- meta$ID
s = intersect(rownames(meta),colnames(exprSet));length(s)
#[1] 518
exprSet = exprSet[,s]
#确认顺序
identical(rownames(meta),colnames(exprSet))
#[1] TRUE
save(meta,exprSet,proj,file = paste0(proj,"_sur_analysis.Rdata"))
1.输入数据
rm(list = ls())
proj = "TCGA-KIRC"
load(paste0(proj,"_sur_analysis.Rdata"))
ls() #显示当前载入的内容是哪些
exprSet[1:4,1:4]
str(meta)
library(stringr)
library(survival)
library(survminer)
#输入基因名
gene = "KIFC1"
2.修改基因行名
rownames(exprSet)= str_replace_all(rownames(exprSet),"-","_")
#for循环生存分析
#特异写了循环函数,假设基因数量多了之后可以修改后使用
for (g in gene) {
meta[[g]] = ifelse((exprSet[rownames(exprSet) == g,]) <
median(exprSet[rownames(exprSet) == g,]),'low','high')
meta[[g]] = factor(meta[[g]],levels = c("low","high"))
sfit <- survfit(as.formula(paste("Surv(time, event) ~", g)), data=meta)
ggsurvplot(sfit,pval=TRUE)
png(filename = paste0(g,"_median.png"), width = 8, height = 6, units = "in", res = 300)
p <- ggsurvplot(
sfit,
censor.shape="|",
censor.size = 4,
conf.int = TRUE,
conf.int.style = "ribbon",
conf.int.alpha = 0.2,
pval = TRUE,
palette = "jco",
#surv.median.line = "hv",
ggtheme =theme_bw(),
legend = "top",
legend.labs = c("Low","High"),
xlab = "OS_time(Month)",
ylab = "Survival probability",
title = paste0(g," Influence"),#将标题根据gene分别命名
break.x.by = 24,#24个月界限
break.y.by = 0.1,
risk.table = TRUE,
risk.table.col = "strata",
risk.table.height = 0.2,
risk.table.y.text = FALSE
)
print(p)
dev.off() # 关闭设备,完成保存
}
4.生存分析-最佳截断值法
#for循环生存分析
#特异写了循环函数,假设基因数量多了之后可以修改后使用
for (g in gene) {
meta[[g]] = exprSet[rownames(exprSet) == g,] #不需要转置哦。自动转置
res.cut = survminer::surv_cutpoint(meta,
time = "time",
event = "event",
variables = g)
cut = res.cut[["cutpoint"]][1, 1]
meta[[paste0(g,"_group")]] = ifelse(meta[[g]] < cut,'low','high')
meta[[paste0(g,"_group")]] = factor(meta[[paste0(g,"_group")]],levels = c('low','high'))
sfit <- survfit(as.formula(paste("Surv(time, event) ~", paste0(g,"_group"))), data=meta)
ggsurvplot(sfit,pval=TRUE)
png(filename = paste0(g,"_best_cut.png"), width = 8, height = 6, units = "in", res = 300)
p1 <- ggsurvplot(
sfit,
censor.shape="|",
censor.size = 4,
conf.int = TRUE,
conf.int.style = "ribbon",
conf.int.alpha = 0.2,
pval = TRUE,
palette = "jco",
#surv.median.line = "hv",
ggtheme =theme_bw(),
legend = "top",
legend.labs = c("Low","High"),
xlab = "OS_time(Month)",
ylab = "Survival probability",
title = paste0(g," Influence"),#将标题根据gene分别命名
break.x.by = 24,#24个月界限
break.y.by = 0.1,
risk.table = TRUE,
risk.table.col = "strata",
risk.table.height = 0.2,
risk.table.y.text = FALSE
)
print(p1)
dev.off() # 关闭设备,完成保存
}
5.生存分析-四分位法
#for循环生存分析
#特异写了循环函数,假设基因数量多了之后可以修改后使用
for (g in gene) {
meta[[g]] = exprSet[rownames(exprSet) == g,] #不需要转置哦。自动转置
# 计算四分位数
quartiles <- quantile(meta[[g]], probs = c(0.25, 0.5, 0.75))
# 根据四分位数分组
meta[[paste0(g, "_group")]] <- cut(meta[[g]],
breaks = c(-Inf, quartiles, Inf),
labels = c("Q1", "Q2", "Q3", "Q4"))
# 确保分组变量是因子并设置因子水平顺序
meta[[paste0(g, "_group")]] <- factor(meta[[paste0(g, "_group")]], levels = c("Q1", "Q2", "Q3", "Q4"))
sfit <- survfit(as.formula(paste("Surv(time, event) ~", paste0(g,"_group"))), data=meta)
ggsurvplot(sfit,pval=TRUE)
png(filename = paste0(g,"_quantile.png"), width = 8, height = 8, units = "in", res = 300)
p1 <- ggsurvplot(
sfit,
censor.shape="|",
censor.size = 4,
conf.int = TRUE,
conf.int.style = "ribbon",
conf.int.alpha = 0.2,
pval = TRUE,
palette = "jco",
#surv.median.line = "hv",
ggtheme =theme_bw(),
legend = "top",
legend.labs = c("Q1", "Q2", "Q3", "Q4"),
xlab = "OS_time(Month)",
ylab = "Survival probability",
title = paste0(g," Influence"),#将标题根据gene分别命名
break.x.by = 24,#24个月界限
break.y.by = 0.1,
risk.table = TRUE,
risk.table.col = "strata",
risk.table.height = 0.2,
risk.table.y.text = FALSE
)
print(p1)
dev.off() # 关闭设备,完成保存
}
小结:这样的一套代码就可以基本解决单基因分析时常见的问题啦~。
致谢:感谢曾老师,小洁老师以及生信技能树团队全体成员(部分代码来源:生信技能树马拉松和数据挖掘课程)。
注:若对内容有疑惑或者有发现明确错误的朋友,请联系后台(希望多多交流)。
- END -
https://mp.weixin.qq.com/s/54B5ma2o-YolsHWdH0Ockw