Closed ixxmu closed 2 years ago
做完jimmy老师最优秀的学徒,我居然好久没有做复现任务了,好多函数都忘了。。
趁此机会(2021-07暑期时间)好好复习一下,fighting!
数据来源[1]于细胞学万能老二的期刊《nature cell biology》的一篇文章《Transcriptional diversity and bioenergetic shift in human breast cancer metastasis revealed by single-cell RNA sequencing》[2],提到了乳腺癌转移过程中的mitochondrial oxidative phosphorylation (OXPHOS) 通路激活问题。
同时作者上传了自己的code到github https://github.com/lawsonlab/Single_Cell_Metastasis/blob/master/Single_Cell_Classifier.R,文章总共有1707个细胞,经过筛选后剩下1119个细胞:
作者这里用的是seurat 2.1,但是现在更新到4了=-=
数据下载预处理
作者上传的数据已经是FPKM处理好的,可以看到是3个样品,这里我直接下的raw文件进行合并操作
rm(list = ls())
library(dplyr)
library(Seurat)
library(patchwork)
###list所有文件
a = list.files("GSE123837_RAW/")
dir = paste("./GSE123837_RAW/",a,sep="")
n = length(dir)
merge.data = read.table(file = dir[1],header=TRUE,dec = ".")
###合并txt
for (i in 2:n){
new.data = read.table(file = dir[i], header=TRUE, dec = ".")
merge.data = rbind(merge.data,new.data)
}
write.table(merge.data,file = "./merge.txt",row.names=F)
因为有1707个文件,实在太多=--=,先去吃饭了
结果报错
有2个问题:
1.发现不应该用rbind,应该用cbind=-= 浪费了好多时间2.有一个样本最后一行没有2行,作者传失误了?删去就好啦
rm(list = ls())
library(dplyr)
library(Seurat)
library(patchwork)
###list所有文件
a = list.files("GSE123837_RAW/")
dir = paste("./GSE123837_RAW/",a,sep="")
n = length(dir)
merge.data = read.table(file = dir[1],header=TRUE,dec = ".",row.names = 1)
###合并txt
for (i in 2:n){
new.data = read.table(file = dir[i], header=TRUE, dec = ".",row.names = 1)
colnames(new.data)=gsub("./GSE123837_RAW/","",dir[i])
merge.data = cbind(merge.data,new.data)
}
merge.data[1:4,1:2]
colnames(merge.data)[1]=a[1]
dim(merge.data)
#[1] 60155 1707
rm(list = ls())
library(dplyr)
library(Seurat)
library(patchwork)
###list所有文件
a = list.files("GSE123837_RAW/")
dir = paste("./GSE123837_RAW/",a,sep="")
n = length(dir)
merge.data = read.table(file = dir[1],header=TRUE,dec = ".",row.names = 1)
###合并txt
for (i in 2:n){
new.data = read.table(file = dir[i], header=TRUE, dec = ".",row.names = 1)
colnames(new.data)=gsub("./GSE123837_RAW/","",dir[i])
merge.data = cbind(merge.data,new.data)
}
merge.data[1:4,1:2]
colnames(merge.data)[1]=a[1]
dim(merge.data)
#[1] 60155 1707
save(merge.data,file = "raw.Rdata")
###质控
###id转换
library (org.Hs.eg.db)
keytypes(org.Hs.eg.db)
library(clusterProfiler)
load("raw.Rdata")
row.names(merge.data)=unlist(sapply(row.names(merge.data), function(x){
str_split(x,"\\.")[[1]][1]
}))
dim ( merge.data )
#[1] 60155 1707
ids1 =data.frame ( ID = c ( 1 : nrow ( merge.data) ) ,
ensembl_id = rownames ( merge.data) )
ids2 =merge ( toTable ( org.Hs.egENSEMBL ) ,
toTable ( org.Hs.egSYMBOL ) , by = "gene_id" )
ids =merge ( ids1 , ids2 , by = "ensembl_id" )
merge.data=merge.data[ ids$ID , ]
table(table(ids$symbol))
merge.data$symbol=ids$symbol##去重复
library(stringr)
library(limma)
merge.data = as.data.frame(avereps(merge.data[,-ncol(merge.data)],ID = merge.data$symbol) )
dim ( merge.data)
#[1] 25621 1707
group=ifelse(str_detect(colnames(merge.data),"Met"),"M",ifelse(str_detect(colnames(merge.data),"Tum"),"T","U"))
table(group)
##group
#M T U
#384 372 951
###seurat对象
group=as.data.frame(group)
rownames(group)=colnames(merge.data)
scRNA=CreateSeuratObject(counts = merge.data,min.cells = 8,meta.data = group)
###质控
scRNA [["percent.mt"]] = PercentageFeatureSet ( scRNA, pattern = "^MT-" ) ###计算线粒体含量
VlnPlot(scRNA, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
scRNA = subset( scRNA , subset = nFeature_RNA > 2500 & percent.mt < 50 )
###文章筛选条件的2500基因表达的细胞,最少在8个细胞表达以及比列小于50
VlnPlot(scRNA, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
dim(scRNA)
#[1] 25448 1165
文中是1119,有可能是上一步id转换已经粗暴的去掉了=-=
所以需要查看gencode的gtf,查看核糖体基因
gene_id "ENSG00000210049.1"; gene_name "MT-TF"; hgnc_id "HGNC:7481"
gene_id "ENSG00000211459.2"; gene_name "MT-RNR1"; hgnc_id "HGNC:7470"
gene_id "ENSG00000210077.1"; gene_name "MT-TV"; hgnc_id "HGNC:7500"
gene_id "ENSG00000210082.2"; gene_name "MT-RNR2"; hgnc_id "HGNC:7471"
gene_id "ENSG00000209082.1"; gene_name "MT-TL1"; hgnc_id "HGNC:7490"
gene_id "ENSG00000198888.2"; gene_name "MT-ND1"; hgnc_id "HGNC:7455"
gene_id "ENSG00000210100.1"; gene_name "MT-TI"; hgnc_id "HGNC:7488"
gene_id "ENSG00000210107.1"; gene_name "MT-TQ"; hgnc_id "HGNC:7495"
gene_id "ENSG00000210112.1"; gene_name "MT-TM"; hgnc_id "HGNC:7492"
gene_id "ENSG00000198763.3"; gene_name "MT-ND2"; hgnc_id "HGNC:7456"
gene_id "ENSG00000210117.1"; gene_name "MT-TW"; hgnc_id "HGNC:7501"
gene_id "ENSG00000210127.1"; gene_name "MT-TA"; hgnc_id "HGNC:7475"
gene_id "ENSG00000210135.1"; gene_name "MT-TN"; hgnc_id "HGNC:7493"
gene_id "ENSG00000210140.1"; gene_name "MT-TC"; hgnc_id "HGNC:7477"
gene_id "ENSG00000210144.1"; gene_name "MT-TY"; hgnc_id "HGNC:7502"
gene_id "ENSG00000198804.2"; gene_name "MT-CO1"; hgnc_id "HGNC:7419"
gene_id "ENSG00000210151.2"; gene_name "MT-TS1"; hgnc_id "HGNC:7497"
gene_id "ENSG00000210154.1"; gene_name "MT-TD"; hgnc_id "HGNC:7478"
gene_id "ENSG00000198712.1"; gene_name "MT-CO2"; hgnc_id "HGNC:7421"
gene_id "ENSG00000210156.1"; gene_name "MT-TK"; hgnc_id "HGNC:7489"
gene_id "ENSG00000228253.1"; gene_name "MT-ATP8"; hgnc_id "HGNC:7415"
gene_id "ENSG00000198899.2"; gene_name "MT-ATP6"; hgnc_id "HGNC:7414"
gene_id "ENSG00000198938.2"; gene_name "MT-CO3"; hgnc_id "HGNC:7422"
gene_id "ENSG00000210164.1"; gene_name "MT-TG"; hgnc_id "HGNC:7486"
gene_id "ENSG00000198840.2"; gene_name "MT-ND3"; hgnc_id "HGNC:7458"
gene_id "ENSG00000210174.1"; gene_name "MT-TR"; hgnc_id "HGNC:7496"
gene_id "ENSG00000212907.2"; gene_name "MT-ND4L"; hgnc_id "HGNC:7460"
gene_id "ENSG00000198886.2"; gene_name "MT-ND4"; hgnc_id "HGNC:7459"
gene_id "ENSG00000210176.1"; gene_name "MT-TH"; hgnc_id "HGNC:7487"
gene_id "ENSG00000210184.1"; gene_name "MT-TS2"; hgnc_id "HGNC:7498"
gene_id "ENSG00000210191.1"; gene_name "MT-TL2"; hgnc_id "HGNC:7491"
gene_id "ENSG00000198786.2"; gene_name "MT-ND5"; hgnc_id "HGNC:7461"
gene_id "ENSG00000198695.2"; gene_name "MT-ND6"; hgnc_id "HGNC:7462"
gene_id "ENSG00000210194.1"; gene_name "MT-TE"; hgnc_id "HGNC:7479"
gene_id "ENSG00000198727.2"; gene_name "MT-CYB"; hgnc_id "HGNC:7427"
gene_id "ENSG00000210195.2"; gene_name "MT-TT"; hgnc_id "HGNC:7499"
gene_id "ENSG00000210196.2"; gene_name "MT-TP"; hgnc_id "HGNC:7494"
rm(list = ls())
library(dplyr)
library(Seurat)
library(patchwork)
###list所有文件
a = list.files("GSE123837_RAW/")
dir = paste("./GSE123837_RAW/",a,sep="")
n = length(dir)
merge.data = read.table(file = dir[1],header=TRUE,dec = ".",row.names = 1)
###合并txt
for (i in 2:n){
new.data = read.table(file = dir[i], header=TRUE, dec = ".",row.names = 1)
colnames(new.data)=gsub("./GSE123837_RAW/","",dir[i])
merge.data = cbind(merge.data,new.data)
}
merge.data[1:4,1:2]
colnames(merge.data)[1]=a[1]
dim(merge.data)
#[1] 60155 1707
save(merge.data,file = "raw.Rdata")
###质控
###id转换
library (org.Hs.eg.db)
keytypes(org.Hs.eg.db)
library(clusterProfiler)
load("raw.Rdata")
row.names(merge.data)=unlist(sapply(row.names(merge.data), function(x){
str_split(x,"\\.")[[1]][1]
}))
dim ( merge.data )
#[1] 60155 1707
ids1 =data.frame ( ID = c ( 1 : nrow ( merge.data) ) ,
ensembl_id = rownames ( merge.data) )
ids2 =merge ( toTable ( org.Hs.egENSEMBL ) ,
toTable ( org.Hs.egSYMBOL ) , by = "gene_id" )
ids2=ids2[,c(2,3)]
###加入核糖体基因
ids3 =read.csv("mt.txt",sep = " ",header = F)
ids3=ids3[,c(2,4)]
ids3=as.data.frame(apply(ids3,2,function(x){
gsub(";","",x)
}))
ids3$V2=unlist(sapply(ids3$V2, function(x){
str_split(x,"\\.")[[1]][1]
}))
colnames(ids3)=colnames(ids2)
ids2=rbind(ids2,ids3)
ids =merge ( ids1 , ids2 , by = "ensembl_id")
merge.data=merge.data[ ids$ID , ]
table(table(ids$symbol))
merge.data$symbol=ids$symbol##去重复
library(stringr)
library(limma)
merge.data = as.data.frame(avereps(merge.data[,-ncol(merge.data)],ID = merge.data$symbol) )
dim ( merge.data)
#[1] 25658 1707 多了一些
group=ifelse(str_detect(colnames(merge.data),"Met"),"M",ifelse(str_detect(colnames(merge.data),"Tum"),"T","U"))
table(group)
##group
#M T U
#384 372 951
###seurat对象
group=as.data.frame(group)
rownames(group)=colnames(merge.data)
scRNA=CreateSeuratObject(counts = merge.data,min.cells = 24,meta.data = group)
###质控
scRNA [["percent.mt"]] = PercentageFeatureSet ( scRNA, pattern = "^MT-" ) ###计算线粒体含量
VlnPlot(scRNA, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
scRNA = subset( scRNA , subset = nFeature_RNA > 2500 & percent.mt < 50 )
VlnPlot(scRNA, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
dim(scRNA)
#[1] 25448 1158
结果还是不对,原文是3个数据集分开处理的。明天我再试试分开,结果是否一致=-=
这里不写循环的原因是因为分组的编号不一样,并且作者已经做好基因注释
H1
rm(list = ls())
library(Seurat)
h1 <- read.csv("GSE123837_HCI001_combinedmatrix.geneIDensemblID_FPKM_Submission.txt.gz",sep = "",row.names = 1,header = T)
group=ifelse(str_detect(colnames(h1),"LU"),"M",ifelse(str_detect(colnames(h1),"PT"),"T","U"))
dim(h1)
#[1] 60155 375
table(group)
# group
# M T
# 183 192
group=as.data.frame(group)
rownames(group)=colnames(h1)
scRNA=CreateSeuratObject(counts = h1,min.cells = 8,meta.data = group)
###质控
scRNA [["percent.mt"]] = PercentageFeatureSet ( scRNA, pattern = "^MT-" ) ###计算线粒体含量
VlnPlot(scRNA, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
scRNA = subset( scRNA , subset = nFeature_RNA > 2500 & percent.mt < 50 )
VlnPlot(scRNA, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
dim(scRNA)
#[1] 16846 246
H1还剩下246个cells
H2
结果h2的读取,遇到了日期基因=-=,作者传的文件还是挺多坑的,需要去除掉这些日期基因
library(stringr)
h2 <- read.csv("GSE123837_HCI002_combinedmatrix.geneIDensemblID_FPKM_Submission.txt.gz",sep = "",row.names = NULL)
a=c("Dec","Mar","Sep","Jan","Feb","Apr","May","Jun","Jul","Aug","Oct","Nov")
table(h2$row.names)
n=h2[str_detect(h2$row.names,"Dec"),]
###顺便练习for循环了
for (i in 2:length(a)) {
h=h2[str_detect(h2$row.names,a[i]),]
n=rbind(n,h)
}
#
h3=h2[!(h2$row.names%in%n$row.names),]
row.names(h3)=h3$row.names
h3=h3[,-1]
dim(h3)
#1] 60127 576
可以看到,与h1相比,少了28个基因,因为这28个因为作者的疏忽变成日期基因了
group=ifelse(str_detect(colnames(h3),"LU"),"M",ifelse(str_detect(colnames(h3),"PT"),"T","U"))
table(group)
#group
#M T U
#171 393 12
h2分组的时候可以看到有12个cells命名跟之前的明明不一致,这个只能回到gse的网页去查看所属分组
发现是M,所以全部算作M
group=gsub("U","M",group)
table(group)
# group
# M T
# 183 393
group=as.data.frame(group)
rownames(group)=colnames(h3)
scRNA=CreateSeuratObject(counts = h3,min.cells = 8,meta.data = group)
###质控
scRNA [["percent.mt"]] = PercentageFeatureSet ( scRNA, pattern = "^MT-" ) ###计算线粒体含量
VlnPlot(scRNA, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
scRNA = subset( scRNA , subset = nFeature_RNA > 2500 & percent.mt < 50 )
VlnPlot(scRNA, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
dim(scRNA)
#[1] 16360 396
scRNA2=scRNA
H2还剩下396个细胞
H10
h10 <- read.csv("GSE123837_HCI010_combinedmatrix.geneIDensemblID_FPKM_Submission.txt.gz",sep = "",row.names = 1,header = T)
group=ifelse(str_detect(colnames(h10),"LU"),"M",ifelse(str_detect(colnames(h10),"PT"),"T","U"))
table(group)
#M T U
#300 372 84
同样需要去查看命名
colnames(h10)[1:36]
colnames(h10)[241:288]
group=gsub("U","M",group)
table(group)
#M T
#384 372
group=as.data.frame(group)
rownames(group)=colnames(h10)
scRNA=CreateSeuratObject(counts = h10,min.cells = 8,meta.data = group)
###质控
scRNA [["percent.mt"]] = PercentageFeatureSet ( scRNA, pattern = "^MT-" ) ###计算线粒体含量
VlnPlot(scRNA, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
scRNA = subset( scRNA , subset = nFeature_RNA > 2500 & percent.mt < 50 )
VlnPlot(scRNA, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
dim(scRNA)
#[1] [1] 18798 470
scRNA10=scRNA
剩下470个cells
小结
总共三个样本最后所剩cell数为1112,与原文1119也是有所差异,与自己直接进行三个样本放在一起id转换也是有所差异。所以个人觉得原因如下
1.作者自己上传H2数据混入了日期基因,导致H2少了28个基因的表达,所以自己分样本结果比原文少2.不能粗暴的基因id转换,因为会去除很多基因,例如线粒体基因=-= 作者把60155个基因全部进行了注释,所以以后做单细胞,最好是通过gtf基因注释
下面是合并3个sc,这里与文章有所不同,因为文章用的是seurat2,现在3可以直接多数据通过anchor合并了,所以结果也许也会不同=--=
这里作者用cell cycle作为batch effects,所以需要去除影响
###合并
list1 <- list(scRNA1,scRNA2,scRNA10)
####官网建议先正态化与找hvg
for (i in 1:length(list1)) {
list1[[i]] <- NormalizeData(list1[[i]], verbose = FALSE)
list1[[i]] <- FindVariableFeatures(list1[[i]], selection.method = "vst",
nfeatures = 10000, verbose = FALSE)
}
anchors <- FindIntegrationAnchors(object.list = list1, dims = 1:30,anchor.features = 10000)##寻找anchor
mer <- IntegrateData(anchorset = anchors, dims = 1:30)##合并数据
# 读取表达矩阵的第一行作为表头,第一列作为行名
exp.mat <- read.table(file = "nestorawa_forcellcycle_expressionMatrix.txt", header = TRUE,
as.is = TRUE, row.names = 1)###周期基因文件可以再seurat的官网找到噢
# 细胞周期的Marker基因
s.genes <- cc.genes$s.genes
g2m.genes <- cc.genes$g2m.genes
marrow <- CellCycleScoring(mer, s.features = s.genes, g2m.features = g2m.genes, set.ident = TRUE)###计算得分
mer1 <- ScaleData(marrow, vars.to.regress = c("S.Score", "G2M.Score"))##缩放
通过作者给的阈值 logfc>0.25|p<0.05来进行筛选top15基因
###dif
diff_dat <- FindMarkers(mer1,ident.1="T",ident.2="M",
group.by='group')
save(mer1,diff,file = "merge.Rdata")
DEG=diff_dat
deg=DEG
## for volcano
if(T){
nrDEG=DEG
head(nrDEG)
attach(nrDEG)
plot(avg_logFC,-log10(p_val))
library(ggpubr)
df=nrDEG
df$v= -log10(p_val) #df新增加一列'v',值为-log10(p_val)
ggscatter(df, x = "logFC", y = "v",size=0.5)
df$g=ifelse(df$p_val>0.05,'stable', #if 判断:如果这一基因的p_val>0.01,则为stable基因
ifelse( df$avg_logFC>0.25,'up', #接上句else 否则:接下来开始判断那些p_val<0.01的基因,再if 判断:如果logFC >1.5,则为up(上调)基因
ifelse( df$avg_logFC < -0.25,'down','stable') )#接上句else 否则:接下来开始判断那些logFC <1.5 的基因,再if 判断:如果logFC <1.5,则为down(下调)基因,否则为stable基因
)
this_tile <- paste0('Cutoff for FoldChange is ',0.25,
'\nThe number of up gene is ',nrow(df[df$g =='up',]) ,
'\nThe number of down gene is ',nrow(df[df$g =='down',])
)
table(df$g)
df$name=rownames(df)
head(df)
g = ggplot(data=df,
aes(x=avg_logFC, y=-log10(p_val),
color=g)) +
geom_point(alpha=0.4, size=1.75) +
theme_set(theme_set(theme_bw(base_size=20)))+
xlab("logFC") + ylab("-log10 pvalue") +
ggtitle( this_tile ) + theme(plot.title = element_text(size=15,hjust = 0.5))+
scale_colour_manual(values = c('blue','black','red')) ## corresponding to the levels(res$change)
print(g)
ggsave('volcano.png')
## for heatmap
library(dplyr)
deg=deg[order(deg$avg_logFC,decreasing = T),]
DoHeatmap(mer1, features = rownames(deg)[-c(16:131)],group.by='group')
火山图
热图
可以看到与作者的结果有所差异
top15基因
作者说直接到网站下载的生存数据but可能付费用户才能下吧,我实在是找不到,所以我还是到metabric下载的数据(这里偷个懒,用Jimmy老师github的数据=-=)
rm(list = ls())
options(stringsAsFactors = F)
load("metabric_expression.Rdata")
load("metabric_clinical.Rdata")
table(clin$CLAUDIN_SUBTYPE)
###找到basal-like的分型的临床信息
tnbc=clin[clin$CLAUDIN_SUBTYPE%in%c("Basal","claudin-low"),]
dim(tnbc)
#[1] 427 20
###与表达谱做交集
tnbc$PATIENT_ID = gsub('-','.',tnbc$PATIENT_ID)
tnbc=tnbc[match(colnames(expr),tnbc$PATIENT_ID),]
tnbc=na.omit(tnbc)
dim(tnbc)
#[1] 394 20
最后有394个basal-like样本,与原文的360个样本接近
expr=as.data.frame(expr)
expr=expr[,colnames(expr)%in%tnbc$PATIENT_ID]
save(expr,tnbc,file = "meta_basal.Rdata")
后面将top15与转移相关的基因作为基因集来判断生存分析
需要自己制作 brca.gmt 文件哦,就是前面得到的基因集。
###先用gsva试一下
library(GSVA)
library("survival")
library("survminer")
gs=list(read.table("brca.gmt"))
es=gsva(expr,gs,mx.diff=TRUE)
tnbc=tnbc[tnbc$OS_STATUS %in% c('DECEASED','LIVING'),]
tnbc$event=ifelse(tnbc$OS_STATUS=='DECEASED',1,0)
tnbc$time=as.numeric(tnbc$OS_MONTHS)
group=ifelse(es[1,]>median(es[1,]),"high","low")
sfit <- survfit(Surv(time, event)~group, data=tnbc)
sfit
summary(sfit)
ggsurvplot(sfit, conf.int=F, pval=TRUE,pval.method=T)
生存确实具有显著性,不同的原因可能是
1.作者用的RFS,metabric只有OS,但是都具有显著性2.基因signature有所差异3.作者应该不是用的GSVA来算的,因为没有引用,code里面也没有提=-=,所以我也不知道作者具体用的什么方法来算基因集
[1]
数据来源: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE123837[2]
《Transcriptional diversity and bioenergetic shift in human breast cancer metastasis revealed by single-cell RNA sequencing》: https://doi.org/10.1038/S41556-020-0477-0
https://mp.weixin.qq.com/s/8B_EAQL3fGsqkmVTYWOWbQ