Closed ixxmu closed 1 year ago
这周推文是对一篇今年刚发的文章所用的数据集进行了复现,GSE210347,由于文章所用数据太大,用个人电脑进行复现请谨慎。
文章:Pan-cancer single-cell analysis reveals the heterogeneity and plasticity of cancer associated fibroblasts in the tumor microenvironment. https://doi.org/10.1038/s41467-022-34395-2
文献简介:肿瘤相关成纤维细胞(CAFs)是肿瘤微环境(TME)的主要组成部分,影响肿瘤标志物的表达,但目前尚未对其在不同肿瘤类型中的普遍存在特征进行系统研究。在这里,为了分析实体癌的TME情况,对10种实体癌的226个样本进行了泛癌分析,这些数据来自12项研究的164个供体的148个原发肿瘤、53个邻近肿瘤和25个正常样本。以描绘单细胞分辨率下的TME,说明异质性CAFs的共性/可塑性。
数据集: 总共纳入了232个单细胞转录组样本(正常= 31;相邻= 54;Tumor = 148)。其中,31个GSM样本来自GSE134355;11个GSM样本与GSE141445相关;49例样本来自EMBL-EBI的ArrayExpress数据库(www.ebi.ac.uk/arrayexpress),登录号为E-MTAB-8107;2例GSM样本与GSE157703关联;22个GSM样本来源于GSE131907;7个GSM样本来自GSE138709; 31例样本来自EMBL-EBI的ArrayExpress数据库(www.ebi.ac.uk/arrayexpress),登录号为E-MTAB-6149和E-MTAB-6653; 32例CRR样本与GSA数据库相关,登录号为CRA001160; 10个GSM样本与GSE154778相关; 7份HRR样本可在GSA-Human中获得,登录码为HRA000212; 10例甲状腺样本来源于GSA-Human数据库,登录号为HRA000686; 从http://dna-discovery.stanford.edu/download/1401/下载20份胃标本。
汇总好的数据也上传在GEO数据库中为GSE210347
文章复现图:
step1:导入数据
rm(list=ls())
options(stringsAsFactors = F)
library(Seurat)
library(ggplot2)
library(clustree)
library(cowplot)
library(dplyr)
###### step1:导入数据 ######
sce.all=readRDS("GSE210347_counts.Rds")
sce=CreateSeuratObject(sce.all)
table(sce$orig.ident)
head(colnames(sce))
tail(colnames(sce))
sce@assays$RNA@counts[1:10, 1:2]
table(sce$orig.ident)
sce$group<-ifelse(grepl("Colon|um|scrEXT",colnames(sce)),"colorectal",
ifelse(grepl("Bladder1|Bladder2|GallBladder1|GallBladder2",colnames(sce)),"bladder",
ifelse(grepl("Liver",colnames(sce)),"liver",
ifelse(grepl("Lung|Sample",colnames(sce)),"lung",
ifelse(grepl("Pancreas|PDAC",colnames(sce)),"PDAC",
ifelse(grepl("thyroid",colnames(sce)),"thyroid",
ifelse(grepl("gastric|Stomach",colnames(sce)),"gastric",
ifelse(grepl("prostate",colnames(sce)),"prostate", ifelse(grepl("BT",colnames(sce)),"ovarian","breast")))))))))
table(sce$group)
step2: QC 在此省略
step3:降维分群
#rm(list=ls())
dir.create("2-harmony")
getwd()
setwd("2-harmony")
sce=sce.all
sce
sce <- NormalizeData(sce,
normalization.method = "LogNormalize",
scale.factor = 1e4)
sce <- FindVariableFeatures(sce)
sce <- ScaleData(sce)
sce <- RunPCA(sce, features = VariableFeatures(object = sce))
library(harmony)
seuratObj <- RunHarmony(sce, "orig.ident")
names(seuratObj@reductions)
seuratObj <- RunUMAP(seuratObj, dims = 1:15,
reduction = "harmony")
DimPlot(seuratObj,reduction = "umap",label=T )
sce=seuratObj
sce <- FindNeighbors(sce, reduction = "harmony",
dims = 1:15)
sce.all=sce
#接下来分析,按照分辨率为0.3进行
sel.clust = "RNA_snn_res.0.3"
sce.all <- SetIdent(sce.all, value = sel.clust)
table(sce.all@active.ident)
saveRDS(sce.all, "sce.all_int.rds")
step4:检查常见分群情况
###### step5 检查常见分群情况 ######
dir.create("../3-cell")
setwd("../3-cell")
DimPlot(sce.all, reduction = "umap", group.by = "seurat_clusters",label = T)
DimPlot(sce.all, reduction = "umap", group.by = "RNA_snn_res.0.3",label = T)
ggsave('umap_by_RNA_snn_res.0.3.pdf')
library(ggplot2)
genes_to_check = c('PTPRC', 'CD3D', 'CD3E', 'CD4','CD8A',
'CD19', 'CD79A', 'MS4A1' ,
'IGHG1', 'MZB1', 'SDC1',
'CD68', 'CD163', 'CD14',
'TPSAB1' , 'TPSB2', # mast cells,
'RCVRN','FPR1' , 'ITGAM' ,
'C1QA', 'C1QB', # mac
'S100A9', 'S100A8', 'MMP19',# monocyte
'FCGR3A',
'LAMP3', 'IDO1','IDO2',## DC3
'CD1E','CD1C', # DC2
'KLRB1','NCR1', # NK
'FGF7','MME', 'ACTA2', ## fibo
'DCN', 'LUM', 'GSN' , ## mouse PDAC fibo
'Amy1' , 'Amy2a2', # Acinar_cells
'PECAM1', 'VWF', ## endo
'EPCAM' , 'KRT19', 'PROM1', 'ALDH1A1' )
library(stringr)
genes_to_check=str_to_upper(genes_to_check)
genes_to_check
p <- DotPlot(sce.all, features = unique(genes_to_check),
assay='RNA') + coord_flip()
p
ggsave('check_last_markers.pdf',height = 11,width = 11)
p_umap=DimPlot(sce.all, reduction = "umap",
group.by = "RNA_snn_res.0.3",label = T)
library(patchwork)
p+p_umap
ggsave('markers_umap.pdf',width = 20,height = 12)
DimPlot(sce.all, reduction = "umap",split.by = 'orig.ident',
group.by = "RNA_snn_res.0.3",label = T)
ggsave('orig.ident_umap.pdf',width = 30,height = 5)
DimPlot(sce.all, reduction = "umap",split.by = 'group',
group.by = "RNA_snn_res.0.3",label = T)
ggsave('group_umap.pdf',width = 30,height = 5)
step5:细胞亚群生物学命名
table(sce.all@meta.data$RNA_snn_res.0.3)
#细胞分群命名
celltype=data.frame(ClusterID=0:29,
celltype= 0:29)
#定义细胞亚群
celltype[celltype$ClusterID %in% c(0,15,17,19,21,22),2]='T cells'
celltype[celltype$ClusterID %in% c(16,26,9),2]='B cells'
celltype[celltype$ClusterID %in% c(18),2]='mast'
celltype[celltype$ClusterID %in% c(1,23,5,28),2]='myeloid'
celltype[celltype$ClusterID %in% c(4,6,7,10,11,12,13,24,25,27,14,20),2]='epi'
celltype[celltype$ClusterID %in% c(29,3),2]='Endo'
celltype[celltype$ClusterID %in% c(2,8),2]='fibo'
head(celltype)
celltype
table(celltype$celltype)
sce.all@meta.data$celltype = "NA"
for(i in 1:nrow(celltype)){
sce.all@meta.data[which(sce.all@meta.data$RNA_snn_res.0.3 == celltype$ClusterID[i]),'celltype'] <- celltype$celltype[i]}
table(sce.all@meta.data$celltype)
library(ggsci)
color = c(pal_d3("category20")(20),
pal_d3("category20b")(20),
pal_d3("category20c")(20),
pal_d3("category10")(10))
DimPlot(sce.all, reduction = "umap", group.by = "celltype",
cols = color,
pt.size = 1.5,
label = T,label.box = T
)
ggsave('celltype-umap_2.pdf',height = 6,width = 8)
library(stringr)
genes_to_check=str_to_upper(genes_to_check)
genes_to_check
th=theme(axis.text.x = element_text(angle = 45,
vjust = 0.5, hjust=0.5))
p <- DotPlot(sce.all, features = unique(genes_to_check) , assay='RNA' ,group.by = 'celltype' ) + coord_flip() +th
p
ggsave(plot=p, filename="check_marker_by_all.pdf",width = 10 ,height = 9)
step6: 根据10种实体癌症分组看细胞亚群分布
# 1.加载R包 ----
rm(list=ls())
library(ggplot2)
library(cowplot)
library(paletteer)
library(gplots)
library(ggpubr)
library(ggsci)
library(stringr)
# 2.设置工作路径 ----
#在meta信息中添加分组信息
getwd()
dir.create("../4_group")
setwd("../4_group/")
phe=sce.all@meta.data
# 4.可视化 ----
## 4.1 每种细胞类型中,分组所占比例 ----
library(tidyr)# 使用的gather & spread
library(reshape2) # 使用的函数 melt & dcast
library(dplyr)
tb=table(phe$celltype,
phe$group)
head(tb)
balloonplot(tb)
bar_data <- as.data.frame(tb)
bar_per <- bar_data %>%
group_by(Var1) %>%
mutate(sum(Freq)) %>%
mutate(percent = Freq / `sum(Freq)`)
head(bar_per)
write.csv(bar_per,file = "celltype_by_group_percent.csv")
library(ggplot2)
ggplot(bar_per, aes(x = Var1, y = percent)) +
geom_bar(aes(fill = Var2) , stat = "identity") + coord_flip() +
theme(axis.ticks = element_line(linetype = "blank"),
legend.position = "top",
panel.grid.minor = element_line(colour = NA,linetype = "blank"),
panel.background = element_rect(fill = NA),
plot.background = element_rect(colour = NA)) +
labs(y = "% Relative cell source", fill = NULL)+labs(x = NULL)+
scale_fill_d3()
#可视化
ggsave("celltype_by_group_percent.pdf",
units = "cm",width = 20,height = 12)
我们可以从细胞分布图中看出与上皮癌细胞的组织类型特异性分布相比,无论恶性状态和组织类型,TME均无明显偏倚。
除此之外也可从对不同实体肿瘤进行tumor, normal 和 adjacent分组入手进行分析。
文章中提取成纤维细胞亚群进一步分析,感兴趣的可以去看看。
https://mp.weixin.qq.com/s/_ZOTlI0QRszCJJa1ZAVSew