一篇NC文献部分复现——小鼠正常肺和纤维化肺细胞的 scRNA-seq 分析

一篇NC文献部分复现——小鼠正常肺和纤维化肺细胞的 scRNA-seq 分析 by 单细胞天地

❝
应群里小伙伴要求，挑了一篇NC的文章来复现，文章中所用的单细胞数据集比较多，按照文章思路选用GSE132771数据集（小鼠正常肺和纤维化肺细胞的 scRNA-seq 分析）来做复现。
❞

文章简介：

胶原生成细胞（Collagen-producing cells）维持肺的复杂结构并驱动肺纤维化的病理性瘢痕形成。文章中进行scRNA-seq，以鉴定正常和纤维化肺中所有胶原生成细胞，在小鼠肺的不同隔室中具有不同解剖定位的多个胶原生成亚群的特征，一个以表达「Cthrc1」 (collagen triple helixrepeat containing 1) 为特征的亚群出现在纤维化肺中，表达最高水平的胶原。

文章数据集：

作者把小鼠和人样本上传到一块了，这里我们先选取他们的小鼠样本来做复现。

只用GSM3891612-GSM3891619的样本

样本处理：

使用Col1a1-EGFP (Col-GFP) 报告小鼠收集小鼠肺中所有的胶原生成细胞，通过气管内灌注博来霉素诱导2只Col-GFP小鼠肺纤维化，并用博来霉素治疗14天后，而后制备肺单细胞悬液，同时以2只未处理的Col-GFP小鼠作为对照。

复现的图：

导入数据

rm(list=ls())
options(stringsAsFactors = F) 
library(harmony)
library(ggsci)
library(dplyr) 
library(future)
library(Seurat)
library(clustree)
library(cowplot)
library(data.table)
library(dplyr)
library(ggplot2)
library(patchwork)
library(stringr)

###### step1:导入数据 ######   
dir='../GSE132771/' 
samples=list.files( dir )
samples 
sceList = lapply(samples,function(pro){ 
  # pro=samples[1] 
  print(pro)  
  sce = CreateSeuratObject(counts =  Read10X(file.path(dir,pro )) ,
                          project = gsub('^GSM[0-9]*_','',
                                         gsub('filtered_feature_bc_matrix','',pro) )  ,# pro, #
                          min.cells = 5,
                          min.features = 500 )
  print(sce)
  cid = tail(colnames(sce)[order(sce$nFeature_RNA)],8000)
  sce$nFeature_RNA[cid]
  sce=sce[,colnames(sce) %in% cid]
  print(sce)
  return(sce)
})
names(sceList) 

sce.all=merge(x=sceList[[1]],
              y=sceList[ -1 ],
              add.cell.ids =  gsub('^GSM[0-9]*_','',
               gsub('filtered_feature_bc_matrix','',samples))) 

as.data.frame(sce.all@assays$RNA@counts[1:10, 1:2])
head(sce.all@meta.data, 10)
table(sce.all$orig.ident) 
# 分组信息，通常是隐含在文件名，样品名字里面
phe=str_split( colnames(sce.all),'[-_]',simplify = T)
head(phe)
table(phe[,2])
sce.all$group= phe[,2]

table(sce.all@meta.data$group)
table(sce.all@meta.data$orig.ident)
sce.all$sample<-ifelse(grepl("GSM3891612|GSM3891613|GSM3891614|GSM3891615",sce.all$orig.ident),
                       "Bleomycin","Untreated")
table(sce.all$sample)

sce.all$sample2<-ifelse(grepl("GSM3891612|GSM3891613|GSM3891616|GSM3891617",sce.all$orig.ident),
                        "GFPp","GFPn")
table(sce.all$sample2)

as.data.frame(sce.all@assays$RNA@counts[1:10, 1:2])
head(sce.all@meta.data, 10)

质控步骤见该合集早期推文

降维分群及harmony

dir.create("2-harmony")
getwd()
setwd("2-harmony")

# sce.all=readRDS("../1-QC/sce.all_qc.rds")
sce=sce.all 
sce
sce <- NormalizeData(sce, 
                         normalization.method = "LogNormalize",
                         scale.factor = 1e4) 
sce <- FindVariableFeatures(sce)
sce <- ScaleData(sce)
sce <- RunPCA(sce, features = VariableFeatures(object = sce))

library(harmony)
seuratObj <- RunHarmony(sce, "orig.ident")
names(seuratObj@reductions)
seuratObj <- RunUMAP(seuratObj,  dims = 1:15, 
                     reduction = "harmony")
DimPlot(seuratObj,reduction = "umap",label=T ) 

sce=seuratObj
sce <- FindNeighbors(sce, reduction = "harmony",
                     dims = 1:15) 
sce.all=sce
#设置不同的分辨率，观察分群效果(选择哪一个？)
for (res in c(0.01, 0.05, 0.1, 0.2, 0.3, 0.5,0.8,1)) {
  sce.all=FindClusters(sce.all, #graph.name = "CCA_snn",
                       resolution = res, algorithm = 1)
}
colnames(sce.all@meta.data)
apply(sce.all@meta.data[,grep("RNA_snn",colnames(sce.all@meta.data))],2,table)
head(colnames(sce.all))
table(sce.all$orig.ident)
table(sce.all$group)

#接下来分析，按照分辨率为0.1进行 
sel.clust = "RNA_snn_res.0.1"
sce.all <- SetIdent(sce.all, value = sel.clust)
table(sce.all@active.ident) 
saveRDS(sce.all, "sce.all_int.rds")
getwd()
setwd('../')

dir.create("3-cell")
setwd("3-cell/")  
getwd()
DimPlot(sce.all, reduction = "umap",
        group.by = "sample2",label = T) 
ggsave('umap_by_RNA_snn_res.0.1_GFP.pdf',width=6,height=5)
DimPlot(sce.all, reduction = "umap", group.by = "seurat_clusters",label = T)

之后利用文章给出的marker gene来给细胞亚群命名。

DimPlot(sce.all, reduction = "umap", group.by = "RNA_snn_res.0.1",label = T) 
ggsave('umap_by_RNA_snn_res.0.1.pdf',width=6,height=5)
sce.all

library(ggplot2) 
genes_to_check=c("PECAM1","TEK","PTPRC","VAV1","EPCAM","CDH1","CD3E","CD19",
                 "ITGAX","SIGLE-CF","CD68","CCR2","LY6G","MSIN","CD86","ITGAE",
                 "NKG7","GZMB","PDGFRA","TCF21","CDH11")
library(stringr)  
genes_to_check=str_to_title(unique(genes_to_check))
genes_to_check
p_all_markers <- DotPlot(sce.all, features = genes_to_check,
                         assay='RNA')  + coord_flip()

p_all_markers
ggsave(plot=p_all_markers, filename="check_all_marker_by_seurat_cluster.pdf",width=6,height=8)

genes_to_check = c('PTPRC', 'CD3D', 'CD3E', 'CD4','CD8A','CD19', 'CD79A', 'MS4A1' ,
                   'IGHG1', 'MZB1', 'SDC1',
                   'CD68', 'CD163', 'CD14', 
                   'TPSAB1' , 'TPSB2',  # mast cells,
                   'MKI67','TOP2A','KLRC1',
                   'RCVRN','FPR1' , 'ITGAM' ,
                   'FGF7','MME', 'ACTA2',
                   'PECAM1', 'VWF',    
                   'KLRB1','NCR1', # NK 
                   'EPCAM' , 'KRT19', 'PROM1', 'ALDH1A1',
                   'MKI67' ,'TOP2A',"SiS","OLFM4","MUC2","CHGA","LYZ1","DCLK1" )
genes_to_check=str_to_title(unique(genes_to_check))
genes_to_check
p_all_markers2 <- DotPlot(sce.all, features = genes_to_check,
                         assay='RNA')  + coord_flip()
p_all_markers2+p_all_markers
ggsave('markers_gene.pdf',width = 16,height = 8)

# 需要自行看图,定细胞亚群：
# 文章里面的 ：  
  # 0：Mesenchymal
  # 1,7: Endo
  # 8,10,11,12: Epi
  # 3: Hematopoietic, NK-T
  # 4,6: Myeloid and Macrophage
  # 5: B
  # 2：fibo
  # 9: cycling 


celltype=data.frame(ClusterID=0:12 ,
                      celltype= 0:12) 
  #定义细胞亚群        
  celltype[celltype$ClusterID %in% c( 0),2]='Mesenchymal' 
  celltype[celltype$ClusterID %in% c( 1,7 ),2]='Endo' 
  celltype[celltype$ClusterID %in% c( 8,10,11,12   ),2]= 'Epi' # 'myeloids'   
  celltype[celltype$ClusterID %in% c( 3  ),2]='T' # 'myeloids'  
  celltype[celltype$ClusterID %in% c( 4,6  ),2]='Myeloid and Macrophage' # 'myeloids' 
  celltype[celltype$ClusterID %in% c( 5 ),2]='B'
  celltype[celltype$ClusterID %in% c( 2 ),2]='fibo'
  celltype[celltype$ClusterID %in% c(9),2]='cycling'

head(celltype)
celltype
table(celltype$celltype)

sce.all@meta.data$celltype = "NA"
for(i in 1:nrow(celltype)){
  sce.all@meta.data[which(sce.all@meta.data$RNA_snn_res.0.1 == celltype$ClusterID[i]),'celltype'] <- celltype$celltype[i]}
table(sce.all@meta.data$celltype)


th=theme(axis.text.x = element_text(angle = 45, 
                                    vjust = 0.5, hjust=0.5)) 

library(patchwork)
p_all_markers=DotPlot(sce.all, features = genes_to_check,
                      assay='RNA' ,group.by = 'celltype' )  + coord_flip()+th
p_umap=DimPlot(sce.all, reduction = "umap", group.by = "celltype",label = T,label.box = T)
p_all_markers+p_umap
ggsave('markers_umap_by_celltype.pdf',width = 16,height = 8)

expr <- sce.all@assays$RNA@data
gene_expression <- expr %>% .["Col1a1",] %>% as.data.frame()
colnames(gene_expression) <- "Col1a1"
gene_expression$cell <- rownames(gene_expression)
#Col1a1 postive
gene_expression_sel <- gene_expression[which(gene_expression$Col1a1>0),]
Col1a1 <- sce.all[,rownames(gene_expression_sel)]

FeaturePlot(sce.all,c('Col1a1'))
ggsave('umap_Col1a1.pdf',height = 7,width = 8)

setwd("../")
dir.create("./4-umap")
setwd("4-umap/")
#对Col1a1重新降维分群
sce<-Col1a1
sce <- NormalizeData(sce, 
                     normalization.method = "LogNormalize",
                     scale.factor = 1e4) 
sce <- FindVariableFeatures(sce)
sce <- ScaleData(sce)
sce <- RunPCA(sce, features = VariableFeatures(object = sce))

library(harmony)
seuratObj <- RunHarmony(sce, "orig.ident")
names(seuratObj@reductions)
seuratObj <- RunUMAP(seuratObj,  dims = 1:15, 
                     reduction = "harmony")
DimPlot(seuratObj,reduction = "umap",label=T ) 
ggsave('umap_Col1a1.pdf',height = 7,width = 8)

DimPlot(sce, reduction = "umap",
        group.by = "sample",label = T) 
ggsave('umap_by_group.pdf',height = 7,width = 8)

gene<-c("Col1a1","Acta2","Pdgfra","Pdgfrb","Myh11","Mcam","Cspg4")

p<-FeaturePlot(sce,features = gene,reduction = "umap",
               pt.size = 1,label = F,label.size = 5,cols = c("lightgrey","blue"))+
              scale_x_continuous("")+scale_y_continuous("")+
              theme_bw()+ #改变ggplot2的主题
               theme( #进一步修改主题
               panel.grid.major = element_blank(),
               panel.grid.minor = element_blank(), #去掉背景线
              plot.title = element_text(hjust = 0.5,size=15) #改变标题位置和字体大小
              );p

ggsave('featureplot.pdf',height = 8,width = 8)

推文就先复现到这里了。

微信交流群

如果你也想参与到群里的讨论中，给我提出一些推文更新建议的话欢迎入群。同时你也可以得到我前期更新的所有数据及代码，在群里我会把代码分享给大家，而且大家可以在群里「提出自己感兴趣的单细胞文献」「我们尽可能优先选择大家的数据集去复现和实战，也欢迎大家在群里分享更多其它资料，咱们一起进步，」「比较高级的单细胞分析」**我们也会一起尝试，相信你肯定是会不虚此行哈。

附上之前推文的链接为爱发电不可取（单细胞周更需要你的支持）

「「目前群里已经有近400人，所以你想要进群的话就需要添加我的微信，私聊给我发99元的红包，我把你拉进群里。」」

我们这个群是真正的付费交流群，提供数据，代码，学习资料，也会跟大家互动交流，还会坚持进行创作至少一年。如果介意99元的门槛且不认可我们知识分享的理念，「请不要加我好友」。

ixxmu / mp_duty