两样本原代乳腺癌细胞与肺部扩散癌细胞单细胞测序数据分析

ixxmu / mp_duty

抓取网络文章到github issues保存

122 stars 30 forks source link

两样本原代乳腺癌细胞与肺部扩散癌细胞单细胞测序数据分析 by 生信菜鸟团

这周跟大家分享一个乳腺癌两样本单细胞测序数据的背景知识和简单分析。

样本处理：

将 RFP-Luc 细胞【MCF7（原位ER阳性乳腺癌细胞系）】注射到NSG雌性小鼠的导管树中。导管内注射6个月后，处死2只小鼠;然后将乳腺和肺收集并消化成单个细胞，随后根据 RFP 对 FACS 进行分类。

FACS：Fluorescence-activated Cell Sorting 流式细胞荧光分选技术

RFP：Red Fluorecent Protein 红色荧光蛋白

当我看到单细胞测序数据是人的时候，有点疑惑将细胞注射进小鼠体内后再收集乳腺和肺的细胞，最后测序数据还是人。具体这个原理我还是没有想明白，平时所说的 PDX 模型是将人的肿瘤组织种到小鼠体内，而这个恰恰是将细胞注射进小鼠体内，还是有一定区别的，大家如果有相关背景知识可以留言，让我学习一下。

背景知识：雌激素受体 α 阳性 (ER+) 乳腺癌 (BCs) 占所有乳腺癌的70%以上，并构成了一个特殊的临床挑战，因为它们在最初诊断和治疗后长达几十年复发。由于缺乏足够的模型，控制肿瘤细胞休眠和潜伏疾病的机制仍然难以捉摸。在这里，比较了 ER+ 和三阴性 TNBC 亚型与临床相关的小鼠导管内异种移植的肿瘤进展。ER+ 和 TNBC 细胞在原位期就已经扩散。然而 TN 扩散性肿瘤细胞 DTCs 增殖速度与原发部位细胞相同，并可引起大转移。ER+ DTCs 增殖指数低，仅形成微转移，从而失去上皮特征。

数据集及链接：GSE196936 GEO Accession viewer (nih.gov)

数据：

GSM5904917 Primary_Breast_ Tumor_Cells      #原代乳腺肿瘤细胞
GSM5904918 Lung_Disseminated_Tumor_Cells    #肺部扩散性肿瘤细胞
#这周所用的数据集是csv格式的

数据处理：

这个数据集细胞量是比较少的，可能是由于分选过？

Lung Primary 
255    3272 
过滤后
Lung Primary 
 164    2092

导入数据

rm(list=ls())
options(stringsAsFactors = F) 
library(Seurat)
library(ggplot2)
library(clustree)
library(cowplot)
library(dplyr)

###### step1:导入数据 ######   
library(data.table)
setwd("../")
getwd()
dir='./GSE196936_RAW/' 
samples=list.files( dir )
samples

library(data.table)
sceList = lapply(samples,function(pro){ 
  # pro=samples[1] 
  print(pro) 
  #ct=fread( file.path(dir,pro),data.table = F) 
  sce =CreateSeuratObject(counts =  read.csv( file.path(dir,pro)) ,
                          project =   gsub('.csv','',gsub('^GSM[0-9]*_MCF7_','',pro) ) ,
                          min.cells = 5,
                          min.features = 300 )
  return(sce)
})
sceList 
samples
# gsub('^GSM[0-9]*','',samples) 
sce.all=merge(x=sceList[[1]],
              y=sceList[ -1 ],
              add.cell.ids =   gsub('.csv','',gsub('^GSM[0-9]*_MCF7_','',samples) )  )

as.data.frame(sce.all@assays$RNA@counts[1:10, 1:2])
head(sce.all@meta.data, 10)
table(sce.all$orig.ident)

library(stringr)
sce.all$group=sce.all$orig.ident
table(sce.all@meta.data$group)
table(sce.all@meta.data$orig.ident
#rm(list=ls())
###### step2:QC质控 ######
dir.create("./1-QC")
setwd("./1-QC")
# sce.all=readRDS("../sce.all_raw.rds")
#计算线粒体基因比例
# 人和鼠的基因名字稍微不一样 
mito_genes=rownames(sce.all)[grep("^MT-", rownames(sce.all))] 
mito_genes #13个线粒体基因
sce.all=PercentageFeatureSet(sce.all, "^MT-", col.name = "percent_mito")
fivenum(sce.all@meta.data$percent_mito)
#计算核糖体基因比例
ribo_genes=rownames(sce.all)[grep("^Rp[sl]", rownames(sce.all),ignore.case = T)]
ribo_genes
sce.all=PercentageFeatureSet(sce.all, "^RP[SL]", col.name = "percent_ribo")
fivenum(sce.all@meta.data$percent_ribo)
#计算红血细胞基因比例
rownames(sce.all)[grep("^Hb[^(p)]", rownames(sce.all),ignore.case = T)]
sce.all=PercentageFeatureSet(sce.all, "^HB[^(P)]", col.name = "percent_hb")
fivenum(sce.all@meta.data$percent_hb)
#可视化细胞的上述比例情况
feats <- c("nFeature_RNA", "nCount_RNA", "percent_mito", "percent_ribo", "percent_hb")
feats <- c("nFeature_RNA", "nCount_RNA")
p1=VlnPlot(sce.all, group.by = "orig.ident", features = feats, pt.size = 0, ncol = 2) + 
  NoLegend()
p1
library(ggplot2) 
ggsave(filename="Vlnplot1.pdf",plot=p1)
feats <- c("percent_mito", "percent_ribo", "percent_hb")
p2=VlnPlot(sce.all, group.by = "orig.ident", features = feats, pt.size = 0, ncol = 3, same.y.lims=T) + 
  scale_y_continuous(breaks=seq(0, 100, 5)) +
  NoLegend()
p2 
ggsave(filename="Vlnplot2.pdf",plot=p2)

p3=FeatureScatter(sce.all, "nCount_RNA", "nFeature_RNA", group.by = "orig.ident", pt.size = 0.5)
ggsave(filename="Scatterplot.pdf",plot=p3)
#根据上述指标，过滤低质量细胞/基因
#过滤指标1:最少表达基因数的细胞&最少表达细胞数的基因
selected_c <- WhichCells(sce.all, expression = nFeature_RNA > 300)
selected_f <- rownames(sce.all)[Matrix::rowSums(sce.all@assays$RNA@counts > 0 ) > 3]

sce.all.filt <- subset(sce.all, features = selected_f, cells = selected_c)
dim(sce.all) 
dim(sce.all.filt) 
#  可以看到，主要是过滤了基因，其次才是细胞
C=sce.all.filt@assays$RNA@counts
dim(C)
C=Matrix::t(Matrix::t(C)/Matrix::colSums(C)) * 100
# 这里的C 这个矩阵，有一点大，可以考虑随抽样
C=C[,sample(1:ncol(C),1000)]
most_expressed <- order(apply(C, 1, median), decreasing = T)[50:1]
pdf("TOP50_most_expressed_gene.pdf",width=14)
boxplot(as.matrix(Matrix::t(C[most_expressed, ])),
        cex = 0.1, las = 1, 
        xlab = "% total count per cell", 
        col = (scales::hue_pal())(50)[50:1], 
        horizontal = TRUE)
dev.off()
rm(C)
#过滤指标2:线粒体/核糖体基因比例(根据上面的violin图)
selected_mito <- WhichCells(sce.all.filt, expression = percent_mito < 15)
selected_ribo <- WhichCells(sce.all.filt, expression = percent_ribo > 3)
selected_hb <- WhichCells(sce.all.filt, expression = percent_hb < 1 )
length(selected_hb)
length(selected_ribo)
length(selected_mito)
sce.all.filt <- subset(sce.all.filt, cells = selected_mito)
sce.all.filt <- subset(sce.all.filt, cells = selected_ribo)
sce.all.filt <- subset(sce.all.filt, cells = selected_hb)
dim(sce.all.filt)
table(sce.all.filt$orig.ident) 

#可视化过滤后的情况
feats <- c("nFeature_RNA", "nCount_RNA")
p1_filtered=VlnPlot(sce.all.filt, group.by = "orig.ident", features = feats, pt.size = 0, ncol = 2) + 
  NoLegend()
ggsave(filename="Vlnplot1_filtered.pdf",plot=p1_filtered)

feats <- c("percent_mito", "percent_ribo", "percent_hb")
p2_filtered=VlnPlot(sce.all.filt, group.by = "orig.ident", features = feats, pt.size = 0, ncol = 3) + 
  NoLegend()
ggsave(filename="Vlnplot2_filtered.pdf",plot=p2_filtered)
#过滤指标3:过滤特定基因
# Filter MALAT1 管家基因
sce.all.filt <- sce.all.filt[!grepl("MALAT1", rownames(sce.all.filt),ignore.case = T), ]
# Filter Mitocondrial 线粒体基因
sce.all.filt <- sce.all.filt[!grepl("^MT-", rownames(sce.all.filt),ignore.case = T), ]
# 当然，还可以过滤更多
dim(sce.all.filt) 
dim(sce.all) 
saveRDS(sce.all.filt, "sce.all_qc2_noG.rds")
#sce.all.fit 212.5MB
setwd('../')
source('step2-harmony.R')
source('step3.1-check-marker.R')

降维分群

rm(list=ls())
dir.create("./20220901_GSE196936/2-harmony")
getwd()
setwd("./20220901_GSE196936/2-harmony")
sce.all=readRDS("./1-QC/sce.all_qc2_noG.rds")
sce=sce.all
sce
sce <- NormalizeData(sce, 
                         normalization.method = "LogNormalize",
                         scale.factor = 1e4) 
sce <- FindVariableFeatures(sce)
sce <- ScaleData(sce)
sce <- RunPCA(sce, features = VariableFeatures(object = sce))

library(harmony)
seuratObj <- RunHarmony(sce, "orig.ident")
names(seuratObj@reductions)
seuratObj <- RunUMAP(seuratObj,  dims = 1:15, 
                     reduction = "harmony")
DimPlot(seuratObj,reduction = "umap",label=T ) 

sce=seuratObj
sce <- FindNeighbors(sce, reduction = "harmony",
                     dims = 1:15) 
sce.all=sce
#设置不同的分辨率，观察分群效果
for (res in c(0.01, 0.05, 0.1, 0.2, 0.3, 0.5,0.8,1)) {
  sce.all=FindClusters(sce.all, #graph.name = "CCA_snn", 
                       resolution = res, algorithm = 1)
}
colnames(sce.all@meta.data)
apply(sce.all@meta.data[,grep("RNA_snn",colnames(sce.all@meta.data))],2,table)

p1_dim=plot_grid(ncol = 3, DimPlot(sce.all, reduction = "umap", group.by = "RNA_snn_res.0.01") + 
                   ggtitle("louvain_0.01"), DimPlot(sce.all, reduction = "umap", group.by = "RNA_snn_res.0.1") + 
                   ggtitle("louvain_0.1"), DimPlot(sce.all, reduction = "umap", group.by = "RNA_snn_res.0.2") + 
                   ggtitle("louvain_0.2"))
ggsave(plot=p1_dim, filename="Dimplot_diff_resolution_low.pdf",width = 14)

p1_dim=plot_grid(ncol = 3, DimPlot(sce.all, reduction = "umap", group.by = "RNA_snn_res.0.8") + 
                   ggtitle("louvain_0.8"), DimPlot(sce.all, reduction = "umap", group.by = "RNA_snn_res.1") + 
                   ggtitle("louvain_1"), DimPlot(sce.all, reduction = "umap", group.by = "RNA_snn_res.0.3") + 
                   ggtitle("louvain_0.3"))
ggsave(plot=p1_dim, filename="Dimplot_diff_resolution_high.pdf",width = 18)

p2_tree=clustree(sce.all@meta.data, prefix = "RNA_snn_res.")
ggsave(plot=p2_tree, filename="Tree_diff_resolution.pdf")

#接下来分析，按照分辨率为0.1进行 
sel.clust = "RNA_snn_res.0.1"
sce.all <- SetIdent(sce.all, value = sel.clust)
table(sce.all@active.ident) 
saveRDS(sce.all, "sce.all_int.rds")
sce.all=readRDS("./2-harmony/sce.all_int.rds")

检查常见分群情况

###### step5:检查常见分群情况  ######
setwd('../')
dir.create("3-cell")
setwd("3-cell")  
getwd()
DimPlot(sce.all, reduction = "umap", group.by = "seurat_clusters",label = T) 
DimPlot(sce.all, reduction = "umap", group.by = "RNA_snn_res.0.1",label = T) 
ggsave('umap_by_RNA_snn_res.0.1.pdf')

library(stringr)
#刚开始只找了一些基础marker分不太开
#genes_to_check = c('PTPRC', 'CD3D', 'CD3E', 'CD8A',
             # 'CD19', 'CD79A', 'MS4A1' , 'IGHG1', 
             # 'MZB1', 'SDC1',
                #   'CD68', 'CD163', 'CD14', 
                  # 'RCVRN',
                  # 'FGF7','MME',
                  # 'PECAM1', 'VWF', 
                #'EPCAM' , 'KRT19', 'PROM1', 'ALDH1A1')

#genes_to_check=unique(genes_to_check)
#genes_to_check = str_to_upper(genes_to_check)
#因此又多找了一些marker
genes_to_check = ("PTPRC","CD3D","CD3E","CD8A","CD19",
              "CD79A","MS4A1","IGHG1","MZB1","SDC1",
              "CD68","CD163","CD14","RCVRN","FGF7",    
              "MME", "PECAM1","VWF","EPCAM","KRT19",   
          "PROM1","ALDH1A1","AGER","SFTPA1","SCGB1A1", 
                "KRT17","TPPP3","KRT4","KRT14","KRT8", 
                "KRT18","CD4","CCR7","SELL","TCF7",    
             "CXCR6","ITGA1","FOXP3","IL2RA","CTLA4",   
                "GZMB","GZMK","CCL5","IFNG","CCL4",    
                "CCL3","PRF1","NKG7","CD22","TCL1A",  
           "CD83","CD38","TNFRSF17", "IGHG4","TPSAB1", 
             "TPSB2","Krt17","Krt14","Krt5","Acta2" ,  
              "Myl9","Mylk", "Myh11","Krt19", "Krt18" , 
                "Krt8","Prlr", "Cited1" "Pgr", "Prom1", 
                "Esr1","Mfge8","Trf","Csn3","Wfdc18",  
            "Elf5","Ltf", "Kit","Aldh1a3","Cd14",    
               "CD45+", "CD10+","CD31+","ACTA2","APOE", 
                "TIMP1", "TAGLN","SLPI","LTF","AGR2",   
              "KLRD1","FCGR3B","RPE65","MKI67","TOP2A", 
                "MLANA","MITF","DCT","PRAME", "GEP",   
                "CD24","ZEB2","CDH1","ESR1","VIM"     
                "FN1","ELN","COL3A1") 
genes_to_check=unique(genes_to_check)
genes_to_check = str_to_upper(genes_to_check)

library(stringr)  
p_paper_markers <- DotPlot(sce.all, features = genes_to_check,
                           assay='RNA'  )  + coord_flip()

p_paper_markers
ggsave(plot=p_paper_markers,
       filename="check_paper_marker_by_seurat_cluster.pdf",width = 12)

除了上述情况我在找乳腺上皮细胞的marker的时候看到之前单细胞天地发的 ” 乳腺上皮细胞单细胞亚群“ 的推文乳腺上皮细胞单细胞亚群 (qq.com)

Myo=c("Krt17", "Krt14", "Krt5", "Acta2", "Myl9", "Mylk", "Myh11")
Lum=c("Krt19", "Krt18", "Krt8")
Hs=c("Prlr", "Cited1", "Pgr", "Prom1", "Esr1")  
AV=c("Mfge8", "Trf", "Csn3", "Wfdc18", "Elf5", "Ltf")
Lp=c("Kit", "Aldh1a3", "Cd14")
genes_to_check = list(
  Myo=Myo,
  Lum=Lum,
  Hs=Hs, 
  AV=AV,
  Lp=Lp )

细胞生物学命名注释

# 需要自行看图，定细胞亚群： 
# 为了方便，直接 RNA_snn_res.0.1， 放弃 RNA_snn_res.0.8 
celltype=data.frame(ClusterID=0:5 ,
                    celltype= 0:5) 
#定义细胞亚群      
celltype[celltype$ClusterID %in% c( 5),2]='fibo'  
celltype[celltype$ClusterID %in% c( 3),2]='low-cell' 
celltype[celltype$ClusterID %in% c( 0,1,4),2]='epi' 
celltype[celltype$ClusterID %in% c( 2),2]='cycling cell'
head(celltype)
celltype
table(celltype$celltype)
sce.all@meta.data$celltype = "NA"

for(i in 1:nrow(celltype)){
  sce.all@meta.data[which(sce.all@meta.data$RNA_snn_res.0.1 == celltype$ClusterID[i]),'celltype'] <- celltype$celltype[i]}
table(sce.all@meta.data$celltype)

th=theme(axis.text.x = element_text(angle = 45, 
                                    vjust = 0.5, hjust=0.5)) 
 
p <- DotPlot(sce.all, features = unique(genes_to_check),
             assay='RNA' ,group.by = 'celltype' )  + coord_flip()  +th

p
ggsave(plot=p, filename="check_marker_by_celltype.pdf",
       width = 8 ,height = 10)

DotPlot(sce.all, features = genes_to_check,
        assay='RNA' ,group.by = 'celltype' ) + 
  coord_flip()+ scale_y_discrete(guide = guide_axis(n.dodge = 2)) +
  NULL

table(sce.all@meta.data$celltype,sce.all@meta.data$RNA_snn_res.0.1)
table(sce.all@meta.data$celltype,sce.all@meta.data$orig.ident)

#拼图  
library(patchwork)
p_umap=DimPlot(sce.all, reduction = "umap", group.by = "celltype",label = F)
p+p_umap
ggsave('markers_umap_by_celltype_2.pdf',width = 12,height = 10)

重新降维和细分亚群

rm(list=ls())
library(Seurat)
library(ggplot2)
library(clustree)
library(cowplot)
library(dplyr)
 getwd()
sce=readRDS('E:/sxjns/公众号文献解读/20220901_GSE196936_乳腺癌/20220901_GSE196936/2-harmony/sce.all_int.rds')
colnames(sce@meta.data)
table(sce$RNA_snn_res.0.1)

sce=sce[,sce$RNA_snn_res.0.1 %in% c(0,1,2)]
DimPlot(sce, reduction = "umap", label = T) + 
    DimPlot(sce, reduction = "umap",group.by = 'orig.ident', label = T)
pp<-FeaturePlot(sce,'EPCAM')
pp
ggsave("epcam.pdf",width = 6,height = 6)

sce
sce=CreateSeuratObject(counts = sce@assays$RNA@counts,
                       meta.data = sce@meta.data)
sce <- NormalizeData(sce, normalization.method =  "LogNormalize",  
                     scale.factor = 1e4)
GetAssay(sce,assay = "RNA")
sce <- FindVariableFeatures(sce, 
                            selection.method = "vst", nfeatures = 2000)  
sce <- ScaleData(sce) 
sce <- RunPCA(object = sce, pc.genes = VariableFeatures(sce)) 
DimHeatmap(sce, dims = 1:12, cells = 100, balanced = TRUE)
ElbowPlot(sce) 

sce <- FindNeighbors(sce, dims = 1:15)
sce <- FindClusters(sce, resolution = 0.8)
table(sce@meta.data$RNA_snn_res.0.8)  

set.seed(123)
sce <- RunTSNE(object = sce, dims = 1:15, do.fast = TRUE)
DimPlot(sce,reduction = "tsne",label=T)

sce <- RunUMAP(object = sce, dims = 1:15, do.fast = TRUE)
DimPlot(sce,reduction = "umap",label=T) 
head(sce@meta.data)
DimPlot(sce,reduction = "umap",label=T,group.by = 'orig.ident') 

# sce <- FindClusters(sce, resolution = 0.1)

getwd()
dir.create("./outputs_figure")
setwd("E:/sxjns/公众号文献解读/20220901_GSE196936_乳腺癌/20220901_GSE196936/sub_epi/outputs_figure/")
#figure正图
DimPlot(sce,reduction = "umap",label=T) 
ggsave(filename = 'umap_recluster_by_0.8.pdf',width = 8,height = 7) 
DimPlot(sce,reduction = "umap",label=T,
        group.by = 'orig.ident') 
ggsave(filename = 'umap_sce_recluster_by_orig.ident.pdf',width = 8,height = 7)

tab.1=table(sce@meta.data$seurat_clusters,sce@meta.data$orig.ident)
colnames(sce@meta.data)   
library(gplots)
tab.1 
pro='before_harmony'
pdf(file = paste0(pro,'  seurat_clusters VS  orig.ident.pdf'),
    width = 10,height  = 10)
balloonplot(tab.1, main =" seurat_clusters VS  orig.ident ", xlab ="", ylab="",
            label = T, show.margins = F)
dev.off() 

colnames(sce@meta.data)  

library(harmony)
seuratObj <- RunHarmony(sce, "orig.ident")
names(seuratObj@reductions)
seuratObj <- RunUMAP(seuratObj,  dims = 1:15, 
                     reduction = "harmony")
DimPlot(seuratObj,reduction = "umap",label=T )  


sce=seuratObj
sce <- FindNeighbors(sce, reduction = "harmony",
                     dims = 1:15)
sce <- FindClusters(sce, resolution = 0.8)
table(sce@meta.data$seurat_clusters)
DimPlot(sce,reduction = "umap",label=T) 
ggsave(filename = 'harmony_umap_recluster_by_0.8.pdf',width = 8,height = 7) 
DimPlot(sce,reduction = "umap",label=T,
        group.by = 'orig.ident') 
ggsave(filename = 'harmony_umap_sce_recluster_by_orig.ident.pdf',width = 8,height = 7) 

tab.1=table(sce@meta.data$orig.ident,sce@meta.data$seurat_clusters)
colnames(sce@meta.data)   
library(gplots)
tab.1 
pro='after_harmony'
pdf(file = paste0(pro,'  seurat_clusters VS  orig.ident.pdf'),
    width = 8,height  = 8)
balloonplot(tab.1, main =" seurat_clusters VS  orig.ident ", xlab ="", ylab="",
            label = T, show.margins = F)
dev.off() 
save(sce,file = 'first_sce.Rdata')

library(ggplot2)
Myo=c("Krt17", "Krt14", "Krt5", "Acta2", "Myl9", "Mylk", "Myh11")
Lum=c("Krt19", "Krt18", "Krt8")
Hs=c("Prlr", "Cited1", "Pgr", "Prom1", "Esr1")  
AV=c("Mfge8", "Trf", "Csn3", "Wfdc18", "Elf5", "Ltf")
Lp=c("Kit", "Aldh1a3", "Cd14")
genes_to_check = list(
  Myo=Myo,
  Lum=Lum,
  Hs=Hs, 
  AV=AV,
  Lp=Lp ) 
genes_to_check=unique(unlist(genes_to_check))
genes_to_check = str_to_upper(genes_to_check)
p1 <- DotPlot(sce, features = genes_to_check,
             assay='RNA' ,
             group.by = "RNA_snn_res.0.8" )  + coord_flip()
p1  
p2 <- DimPlot(sce,reduction = "umap",label=T,
        group.by = "RNA_snn_res.0.8" ) 
library(patchwork)
p1+p2
ggsave('harmony_res.0.1_umap_markers.pdf',width = 15,height = 10)

根据乳腺癌的Myo,Lum,Hs,Av,Lp的几种分型marker来看,每个分群都有"Krt19", "Krt18", "Krt8","ESR1"四个marker，可能细分后的这些都属于Luminal上皮细胞亚群。

针对该数据还可以继续做两个样本的差异分析，大家可以一块来试试。

两样本原代乳腺癌细胞与肺部扩散癌细胞单细胞测序数据分析 by 生信菜鸟团

这周跟大家分享一个乳腺癌两样本单细胞测序数据的背景知识和简单分析。

样本处理：

FACS：Fluorescence-activated Cell Sorting 流式细胞荧光分选技术

RFP：Red Fluorecent Protein 红色荧光蛋白

数据集及链接：GSE196936 GEO Accession viewer (nih.gov)

数据：

GSM5904917 Primary_Breast_ Tumor_Cells      #原代乳腺肿瘤细胞
GSM5904918 Lung_Disseminated_Tumor_Cells    #肺部扩散性肿瘤细胞
#这周所用的数据集是csv格式的

数据处理：

这个数据集细胞量是比较少的，可能是由于分选过？

Lung Primary 
255    3272 
过滤后
Lung Primary 
 164    2092

导入数据

rm(list=ls())
options(stringsAsFactors = F) 
library(Seurat)
library(ggplot2)
library(clustree)
library(cowplot)
library(dplyr)

###### step1:导入数据 ######   
library(data.table)
setwd("../")
getwd()
dir='./GSE196936_RAW/' 
samples=list.files( dir )
samples

library(data.table)
sceList = lapply(samples,function(pro){ 
  # pro=samples[1] 
  print(pro) 
  #ct=fread( file.path(dir,pro),data.table = F) 
  sce =CreateSeuratObject(counts =  read.csv( file.path(dir,pro)) ,
                          project =   gsub('.csv','',gsub('^GSM[0-9]*_MCF7_','',pro) ) ,
                          min.cells = 5,
                          min.features = 300 )
  return(sce)
})
sceList 
samples
# gsub('^GSM[0-9]*','',samples) 
sce.all=merge(x=sceList[[1]],
              y=sceList[ -1 ],
              add.cell.ids =   gsub('.csv','',gsub('^GSM[0-9]*_MCF7_','',samples) )  )

as.data.frame(sce.all@assays$RNA@counts[1:10, 1:2])
head(sce.all@meta.data, 10)
table(sce.all$orig.ident)

library(stringr)
sce.all$group=sce.all$orig.ident
table(sce.all@meta.data$group)
table(sce.all@meta.data$orig.ident
#rm(list=ls())
###### step2:QC质控 ######
dir.create("./1-QC")
setwd("./1-QC")
# sce.all=readRDS("../sce.all_raw.rds")
#计算线粒体基因比例
# 人和鼠的基因名字稍微不一样 
mito_genes=rownames(sce.all)[grep("^MT-", rownames(sce.all))] 
mito_genes #13个线粒体基因
sce.all=PercentageFeatureSet(sce.all, "^MT-", col.name = "percent_mito")
fivenum(sce.all@meta.data$percent_mito)
#计算核糖体基因比例
ribo_genes=rownames(sce.all)[grep("^Rp[sl]", rownames(sce.all),ignore.case = T)]
ribo_genes
sce.all=PercentageFeatureSet(sce.all, "^RP[SL]", col.name = "percent_ribo")
fivenum(sce.all@meta.data$percent_ribo)
#计算红血细胞基因比例
rownames(sce.all)[grep("^Hb[^(p)]", rownames(sce.all),ignore.case = T)]
sce.all=PercentageFeatureSet(sce.all, "^HB[^(P)]", col.name = "percent_hb")
fivenum(sce.all@meta.data$percent_hb)
#可视化细胞的上述比例情况
feats <- c("nFeature_RNA", "nCount_RNA", "percent_mito", "percent_ribo", "percent_hb")
feats <- c("nFeature_RNA", "nCount_RNA")
p1=VlnPlot(sce.all, group.by = "orig.ident", features = feats, pt.size = 0, ncol = 2) + 
  NoLegend()
p1
library(ggplot2) 
ggsave(filename="Vlnplot1.pdf",plot=p1)
feats <- c("percent_mito", "percent_ribo", "percent_hb")
p2=VlnPlot(sce.all, group.by = "orig.ident", features = feats, pt.size = 0, ncol = 3, same.y.lims=T) + 
  scale_y_continuous(breaks=seq(0, 100, 5)) +
  NoLegend()
p2 
ggsave(filename="Vlnplot2.pdf",plot=p2)

p3=FeatureScatter(sce.all, "nCount_RNA", "nFeature_RNA", group.by = "orig.ident", pt.size = 0.5)
ggsave(filename="Scatterplot.pdf",plot=p3)
#根据上述指标，过滤低质量细胞/基因
#过滤指标1:最少表达基因数的细胞&最少表达细胞数的基因
selected_c <- WhichCells(sce.all, expression = nFeature_RNA > 300)
selected_f <- rownames(sce.all)[Matrix::rowSums(sce.all@assays$RNA@counts > 0 ) > 3]

sce.all.filt <- subset(sce.all, features = selected_f, cells = selected_c)
dim(sce.all) 
dim(sce.all.filt) 
#  可以看到，主要是过滤了基因，其次才是细胞
C=sce.all.filt@assays$RNA@counts
dim(C)
C=Matrix::t(Matrix::t(C)/Matrix::colSums(C)) * 100
# 这里的C 这个矩阵，有一点大，可以考虑随抽样
C=C[,sample(1:ncol(C),1000)]
most_expressed <- order(apply(C, 1, median), decreasing = T)[50:1]
pdf("TOP50_most_expressed_gene.pdf",width=14)
boxplot(as.matrix(Matrix::t(C[most_expressed, ])),
        cex = 0.1, las = 1, 
        xlab = "% total count per cell", 
        col = (scales::hue_pal())(50)[50:1], 
        horizontal = TRUE)
dev.off()
rm(C)
#过滤指标2:线粒体/核糖体基因比例(根据上面的violin图)
selected_mito <- WhichCells(sce.all.filt, expression = percent_mito < 15)
selected_ribo <- WhichCells(sce.all.filt, expression = percent_ribo > 3)
selected_hb <- WhichCells(sce.all.filt, expression = percent_hb < 1 )
length(selected_hb)
length(selected_ribo)
length(selected_mito)
sce.all.filt <- subset(sce.all.filt, cells = selected_mito)
sce.all.filt <- subset(sce.all.filt, cells = selected_ribo)
sce.all.filt <- subset(sce.all.filt, cells = selected_hb)
dim(sce.all.filt)
table(sce.all.filt$orig.ident) 

#可视化过滤后的情况
feats <- c("nFeature_RNA", "nCount_RNA")
p1_filtered=VlnPlot(sce.all.filt, group.by = "orig.ident", features = feats, pt.size = 0, ncol = 2) + 
  NoLegend()
ggsave(filename="Vlnplot1_filtered.pdf",plot=p1_filtered)

feats <- c("percent_mito", "percent_ribo", "percent_hb")
p2_filtered=VlnPlot(sce.all.filt, group.by = "orig.ident", features = feats, pt.size = 0, ncol = 3) + 
  NoLegend()
ggsave(filename="Vlnplot2_filtered.pdf",plot=p2_filtered)
#过滤指标3:过滤特定基因
# Filter MALAT1 管家基因
sce.all.filt <- sce.all.filt[!grepl("MALAT1", rownames(sce.all.filt),ignore.case = T), ]
# Filter Mitocondrial 线粒体基因
sce.all.filt <- sce.all.filt[!grepl("^MT-", rownames(sce.all.filt),ignore.case = T), ]
# 当然，还可以过滤更多
dim(sce.all.filt) 
dim(sce.all) 
saveRDS(sce.all.filt, "sce.all_qc2_noG.rds")
#sce.all.fit 212.5MB
setwd('../')
source('step2-harmony.R')
source('step3.1-check-marker.R')

降维分群

rm(list=ls())
dir.create("./20220901_GSE196936/2-harmony")
getwd()
setwd("./20220901_GSE196936/2-harmony")
sce.all=readRDS("./1-QC/sce.all_qc2_noG.rds")
sce=sce.all
sce
sce <- NormalizeData(sce, 
                         normalization.method = "LogNormalize",
                         scale.factor = 1e4) 
sce <- FindVariableFeatures(sce)
sce <- ScaleData(sce)
sce <- RunPCA(sce, features = VariableFeatures(object = sce))

library(harmony)
seuratObj <- RunHarmony(sce, "orig.ident")
names(seuratObj@reductions)
seuratObj <- RunUMAP(seuratObj,  dims = 1:15, 
                     reduction = "harmony")
DimPlot(seuratObj,reduction = "umap",label=T ) 

sce=seuratObj
sce <- FindNeighbors(sce, reduction = "harmony",
                     dims = 1:15) 
sce.all=sce
#设置不同的分辨率，观察分群效果
for (res in c(0.01, 0.05, 0.1, 0.2, 0.3, 0.5,0.8,1)) {
  sce.all=FindClusters(sce.all, #graph.name = "CCA_snn", 
                       resolution = res, algorithm = 1)
}
colnames(sce.all@meta.data)
apply(sce.all@meta.data[,grep("RNA_snn",colnames(sce.all@meta.data))],2,table)

p1_dim=plot_grid(ncol = 3, DimPlot(sce.all, reduction = "umap", group.by = "RNA_snn_res.0.01") + 
                   ggtitle("louvain_0.01"), DimPlot(sce.all, reduction = "umap", group.by = "RNA_snn_res.0.1") + 
                   ggtitle("louvain_0.1"), DimPlot(sce.all, reduction = "umap", group.by = "RNA_snn_res.0.2") + 
                   ggtitle("louvain_0.2"))
ggsave(plot=p1_dim, filename="Dimplot_diff_resolution_low.pdf",width = 14)

p1_dim=plot_grid(ncol = 3, DimPlot(sce.all, reduction = "umap", group.by = "RNA_snn_res.0.8") + 
                   ggtitle("louvain_0.8"), DimPlot(sce.all, reduction = "umap", group.by = "RNA_snn_res.1") + 
                   ggtitle("louvain_1"), DimPlot(sce.all, reduction = "umap", group.by = "RNA_snn_res.0.3") + 
                   ggtitle("louvain_0.3"))
ggsave(plot=p1_dim, filename="Dimplot_diff_resolution_high.pdf",width = 18)

p2_tree=clustree(sce.all@meta.data, prefix = "RNA_snn_res.")
ggsave(plot=p2_tree, filename="Tree_diff_resolution.pdf")

#接下来分析，按照分辨率为0.1进行 
sel.clust = "RNA_snn_res.0.1"
sce.all <- SetIdent(sce.all, value = sel.clust)
table(sce.all@active.ident) 
saveRDS(sce.all, "sce.all_int.rds")
sce.all=readRDS("./2-harmony/sce.all_int.rds")

检查常见分群情况

###### step5:检查常见分群情况  ######
setwd('../')
dir.create("3-cell")
setwd("3-cell")  
getwd()
DimPlot(sce.all, reduction = "umap", group.by = "seurat_clusters",label = T) 
DimPlot(sce.all, reduction = "umap", group.by = "RNA_snn_res.0.1",label = T) 
ggsave('umap_by_RNA_snn_res.0.1.pdf')

library(stringr)
#刚开始只找了一些基础marker分不太开
#genes_to_check = c('PTPRC', 'CD3D', 'CD3E', 'CD8A',
             # 'CD19', 'CD79A', 'MS4A1' , 'IGHG1', 
             # 'MZB1', 'SDC1',
                #   'CD68', 'CD163', 'CD14', 
                  # 'RCVRN',
                  # 'FGF7','MME',
                  # 'PECAM1', 'VWF', 
                #'EPCAM' , 'KRT19', 'PROM1', 'ALDH1A1')

#genes_to_check=unique(genes_to_check)
#genes_to_check = str_to_upper(genes_to_check)
#因此又多找了一些marker
genes_to_check = ("PTPRC","CD3D","CD3E","CD8A","CD19",
              "CD79A","MS4A1","IGHG1","MZB1","SDC1",
              "CD68","CD163","CD14","RCVRN","FGF7",    
              "MME", "PECAM1","VWF","EPCAM","KRT19",   
          "PROM1","ALDH1A1","AGER","SFTPA1","SCGB1A1", 
                "KRT17","TPPP3","KRT4","KRT14","KRT8", 
                "KRT18","CD4","CCR7","SELL","TCF7",    
             "CXCR6","ITGA1","FOXP3","IL2RA","CTLA4",   
                "GZMB","GZMK","CCL5","IFNG","CCL4",    
                "CCL3","PRF1","NKG7","CD22","TCL1A",  
           "CD83","CD38","TNFRSF17", "IGHG4","TPSAB1", 
             "TPSB2","Krt17","Krt14","Krt5","Acta2" ,  
              "Myl9","Mylk", "Myh11","Krt19", "Krt18" , 
                "Krt8","Prlr", "Cited1" "Pgr", "Prom1", 
                "Esr1","Mfge8","Trf","Csn3","Wfdc18",  
            "Elf5","Ltf", "Kit","Aldh1a3","Cd14",    
               "CD45+", "CD10+","CD31+","ACTA2","APOE", 
                "TIMP1", "TAGLN","SLPI","LTF","AGR2",   
              "KLRD1","FCGR3B","RPE65","MKI67","TOP2A", 
                "MLANA","MITF","DCT","PRAME", "GEP",   
                "CD24","ZEB2","CDH1","ESR1","VIM"     
                "FN1","ELN","COL3A1") 
genes_to_check=unique(genes_to_check)
genes_to_check = str_to_upper(genes_to_check)

library(stringr)  
p_paper_markers <- DotPlot(sce.all, features = genes_to_check,
                           assay='RNA'  )  + coord_flip()

p_paper_markers
ggsave(plot=p_paper_markers,
       filename="check_paper_marker_by_seurat_cluster.pdf",width = 12)

除了上述情况我在找乳腺上皮细胞的marker的时候看到之前单细胞天地发的 ” 乳腺上皮细胞单细胞亚群“ 的推文乳腺上皮细胞单细胞亚群 (qq.com)

Myo=c("Krt17", "Krt14", "Krt5", "Acta2", "Myl9", "Mylk", "Myh11")
Lum=c("Krt19", "Krt18", "Krt8")
Hs=c("Prlr", "Cited1", "Pgr", "Prom1", "Esr1")  
AV=c("Mfge8", "Trf", "Csn3", "Wfdc18", "Elf5", "Ltf")
Lp=c("Kit", "Aldh1a3", "Cd14")
genes_to_check = list(
  Myo=Myo,
  Lum=Lum,
  Hs=Hs, 
  AV=AV,
  Lp=Lp )

细胞生物学命名注释

# 需要自行看图，定细胞亚群： 
# 为了方便，直接 RNA_snn_res.0.1， 放弃 RNA_snn_res.0.8 
celltype=data.frame(ClusterID=0:5 ,
                    celltype= 0:5) 
#定义细胞亚群      
celltype[celltype$ClusterID %in% c( 5),2]='fibo'  
celltype[celltype$ClusterID %in% c( 3),2]='low-cell' 
celltype[celltype$ClusterID %in% c( 0,1,4),2]='epi' 
celltype[celltype$ClusterID %in% c( 2),2]='cycling cell'
head(celltype)
celltype
table(celltype$celltype)
sce.all@meta.data$celltype = "NA"

for(i in 1:nrow(celltype)){
  sce.all@meta.data[which(sce.all@meta.data$RNA_snn_res.0.1 == celltype$ClusterID[i]),'celltype'] <- celltype$celltype[i]}
table(sce.all@meta.data$celltype)

th=theme(axis.text.x = element_text(angle = 45, 
                                    vjust = 0.5, hjust=0.5)) 
 
p <- DotPlot(sce.all, features = unique(genes_to_check),
             assay='RNA' ,group.by = 'celltype' )  + coord_flip()  +th

p
ggsave(plot=p, filename="check_marker_by_celltype.pdf",
       width = 8 ,height = 10)

DotPlot(sce.all, features = genes_to_check,
        assay='RNA' ,group.by = 'celltype' ) + 
  coord_flip()+ scale_y_discrete(guide = guide_axis(n.dodge = 2)) +
  NULL

table(sce.all@meta.data$celltype,sce.all@meta.data$RNA_snn_res.0.1)
table(sce.all@meta.data$celltype,sce.all@meta.data$orig.ident)

#拼图  
library(patchwork)
p_umap=DimPlot(sce.all, reduction = "umap", group.by = "celltype",label = F)
p+p_umap
ggsave('markers_umap_by_celltype_2.pdf',width = 12,height = 10)

重新降维和细分亚群

rm(list=ls())
library(Seurat)
library(ggplot2)
library(clustree)
library(cowplot)
library(dplyr)
 getwd()
sce=readRDS('E:/sxjns/公众号文献解读/20220901_GSE196936_乳腺癌/20220901_GSE196936/2-harmony/sce.all_int.rds')
colnames(sce@meta.data)
table(sce$RNA_snn_res.0.1)

sce=sce[,sce$RNA_snn_res.0.1 %in% c(0,1,2)]
DimPlot(sce, reduction = "umap", label = T) + 
    DimPlot(sce, reduction = "umap",group.by = 'orig.ident', label = T)
pp<-FeaturePlot(sce,'EPCAM')
pp
ggsave("epcam.pdf",width = 6,height = 6)

sce
sce=CreateSeuratObject(counts = sce@assays$RNA@counts,
                       meta.data = sce@meta.data)
sce <- NormalizeData(sce, normalization.method =  "LogNormalize",  
                     scale.factor = 1e4)
GetAssay(sce,assay = "RNA")
sce <- FindVariableFeatures(sce, 
                            selection.method = "vst", nfeatures = 2000)  
sce <- ScaleData(sce) 
sce <- RunPCA(object = sce, pc.genes = VariableFeatures(sce)) 
DimHeatmap(sce, dims = 1:12, cells = 100, balanced = TRUE)
ElbowPlot(sce) 

sce <- FindNeighbors(sce, dims = 1:15)
sce <- FindClusters(sce, resolution = 0.8)
table(sce@meta.data$RNA_snn_res.0.8)  

set.seed(123)
sce <- RunTSNE(object = sce, dims = 1:15, do.fast = TRUE)
DimPlot(sce,reduction = "tsne",label=T)

sce <- RunUMAP(object = sce, dims = 1:15, do.fast = TRUE)
DimPlot(sce,reduction = "umap",label=T) 
head(sce@meta.data)
DimPlot(sce,reduction = "umap",label=T,group.by = 'orig.ident') 

# sce <- FindClusters(sce, resolution = 0.1)

getwd()
dir.create("./outputs_figure")
setwd("E:/sxjns/公众号文献解读/20220901_GSE196936_乳腺癌/20220901_GSE196936/sub_epi/outputs_figure/")
#figure正图
DimPlot(sce,reduction = "umap",label=T) 
ggsave(filename = 'umap_recluster_by_0.8.pdf',width = 8,height = 7) 
DimPlot(sce,reduction = "umap",label=T,
        group.by = 'orig.ident') 
ggsave(filename = 'umap_sce_recluster_by_orig.ident.pdf',width = 8,height = 7)

tab.1=table(sce@meta.data$seurat_clusters,sce@meta.data$orig.ident)
colnames(sce@meta.data)   
library(gplots)
tab.1 
pro='before_harmony'
pdf(file = paste0(pro,'  seurat_clusters VS  orig.ident.pdf'),
    width = 10,height  = 10)
balloonplot(tab.1, main =" seurat_clusters VS  orig.ident ", xlab ="", ylab="",
            label = T, show.margins = F)
dev.off() 

colnames(sce@meta.data)  

library(harmony)
seuratObj <- RunHarmony(sce, "orig.ident")
names(seuratObj@reductions)
seuratObj <- RunUMAP(seuratObj,  dims = 1:15, 
                     reduction = "harmony")
DimPlot(seuratObj,reduction = "umap",label=T )  


sce=seuratObj
sce <- FindNeighbors(sce, reduction = "harmony",
                     dims = 1:15)
sce <- FindClusters(sce, resolution = 0.8)
table(sce@meta.data$seurat_clusters)
DimPlot(sce,reduction = "umap",label=T) 
ggsave(filename = 'harmony_umap_recluster_by_0.8.pdf',width = 8,height = 7) 
DimPlot(sce,reduction = "umap",label=T,
        group.by = 'orig.ident') 
ggsave(filename = 'harmony_umap_sce_recluster_by_orig.ident.pdf',width = 8,height = 7) 

tab.1=table(sce@meta.data$orig.ident,sce@meta.data$seurat_clusters)
colnames(sce@meta.data)   
library(gplots)
tab.1 
pro='after_harmony'
pdf(file = paste0(pro,'  seurat_clusters VS  orig.ident.pdf'),
    width = 8,height  = 8)
balloonplot(tab.1, main =" seurat_clusters VS  orig.ident ", xlab ="", ylab="",
            label = T, show.margins = F)
dev.off() 
save(sce,file = 'first_sce.Rdata')

library(ggplot2)
Myo=c("Krt17", "Krt14", "Krt5", "Acta2", "Myl9", "Mylk", "Myh11")
Lum=c("Krt19", "Krt18", "Krt8")
Hs=c("Prlr", "Cited1", "Pgr", "Prom1", "Esr1")  
AV=c("Mfge8", "Trf", "Csn3", "Wfdc18", "Elf5", "Ltf")
Lp=c("Kit", "Aldh1a3", "Cd14")
genes_to_check = list(
  Myo=Myo,
  Lum=Lum,
  Hs=Hs, 
  AV=AV,
  Lp=Lp ) 
genes_to_check=unique(unlist(genes_to_check))
genes_to_check = str_to_upper(genes_to_check)
p1 <- DotPlot(sce, features = genes_to_check,
             assay='RNA' ,
             group.by = "RNA_snn_res.0.8" )  + coord_flip()
p1  
p2 <- DimPlot(sce,reduction = "umap",label=T,
        group.by = "RNA_snn_res.0.8" ) 
library(patchwork)
p1+p2
ggsave('harmony_res.0.1_umap_markers.pdf',width = 15,height = 10)

根据乳腺癌的Myo,Lum,Hs,Av,Lp的几种分型marker来看,每个分群都有"Krt19", "Krt18", "Krt8","ESR1"四个marker，可能细分后的这些都属于Luminal上皮细胞亚群。

针对该数据还可以继续做两个样本的差异分析，大家可以一块来试试。

ixxmu / mp_duty

两样本原代乳腺癌细胞与肺部扩散癌细胞单细胞测序数据分析 #2718

两样本原代乳腺癌细胞与肺部扩散癌细胞单细胞测序数据分析 by 生信菜鸟团

两样本原代乳腺癌细胞与肺部扩散癌细胞单细胞测序数据分析 by 生信菜鸟团