Closed ixxmu closed 1 year ago
“ 本系列涉及单细胞数据的前期数据处理,以及后续的可视化。把整个流程都打通。”
本系列月更,写满12篇。第一篇主要是单细胞下游分析的全流程,后续11篇会着重写实践中常见的画图需求。
全文分为三部分
01:单细胞降维聚类分群
02:单细胞celltypist自动命名,省去人工注释烦恼
03:单细胞曼哈顿图,批量差异分析、富集分析
01
—
1.降维聚类分群
第一部分的正文内容从这里开始。
首先从geo下载数据,并解压拿到rawdata
#设置工作路径
path="/home/data/t040413/ipf/GSE104154_scRNA-seq_fibrotic MC_bleomycin"
dir.create(path)
setwd(path)
getwd()
list.files()
#read表达矩阵
raw_counts=read.csv("/home/data/t040413/ipf/GSE104154_scRNA-seq_fibrotic MC_bleomycin/GSE104154_d0_d21_sma_tm_Expr_raw/GSE104154_d0_d21_sma_tm_Expr_raw.csv"
)
head(raw_counts)[1:4,1:4]
counts=raw_counts[,-1]
head(counts)[1:4,1:4]
rownames(counts)=counts$symbol
head(raw_counts)[1:4,1:4]
注意:这个数据使用了ensemble id作为基因名,后续需要换为gene symbol
head(raw_counts)[1:4,1:4]
counts=raw_counts[,-2]
head(counts)[1:4,1:4]
rownames(counts)=counts$id
head(counts)[,1:5]
counts=counts[,-1]
head(counts)[,1:5]
创建seurat对象
library(Seurat)
#https://zhuanlan.zhihu.com/p/385206713
rawdata=CreateSeuratObject(counts = counts,project = "blem",assay = "RNA")
这里的seurat对象行名是ensemble id,改为gene symbol 比较容易看,如何改呢?
#去重 取得唯一基因
ids=raw_counts[,1:2]
head(ids)
colnames(ids)= c('ENSEMBL','SYMBOL')
head(ids)
dim(ids) # [1] 16428
ids=na.omit(ids)
dim(ids) # [1] 15504
length(unique(ids$SYMBOL)) # [1] 15494
# 这里的关系超级乱,互相之间都不是一对一
# 凡是混乱的ID一律删除即???
ids=ids[!duplicated(ids$SYMBOL),]
ids=ids[!duplicated(ids$ENSEMBL),]
dim(ids)
pos=match(ids$ENSEMBL,rownames(rawdata) )
hp_sce=rawdata[pos,]
hp_sce
把seurat对象中的ensemble id 改为gene symble
#创建函数 改名
RenameGenesSeurat <- function(obj ,
newnames ) {
# Replace gene names in different slots of a Seurat object. Run this before integration. Run this before integration.
# It only changes obj@assays$RNA@counts, @data and @scale.data.
print("Run this before integration. It only changes obj@assays$RNA@counts, @data and @scale.data.")
RNA <- obj@assays$RNA
if (nrow(RNA) == length(newnames)) {
if (length(RNA@counts)) RNA@counts@Dimnames[[1]] <- newnames
if (length(RNA@data)) RNA@data@Dimnames[[1]] <- newnames
if (length(RNA@scale.data)) RNA@scale.data@Dimnames[[1]] <- newnames
} else {"Unequal gene sets: nrow(RNA) != nrow(newnames)"}
obj@assays$RNA <- RNA
return(obj)
}
hp_sce=RenameGenesSeurat(obj = hp_sce,
newnames = ids$SYMBOL)
getwd()
#save(hp_sce,file = 'first_sce.Rdata')
hp_sce
改名成功
rownames(hp_sce)[grepl('^mt-',rownames(hp_sce))]
rownames(hp_sce)[grepl('^Rp[sl]',rownames(hp_sce))]
hp_sce[["percent.mt"]] <- PercentageFeatureSet(hp_sce, pattern = "^mt-")
fivenum(hp_sce[["percent.mt"]][,1])
rb.genes <- rownames(hp_sce)[grep("^Rp[sl]",rownames(hp_sce))]
C<-GetAssayData(object = hp_sce, slot = "counts")
percent.ribo <- Matrix::colSums(C[rb.genes,])/Matrix::colSums(C)*100
hp_sce <- AddMetaData(hp_sce, percent.ribo, col.name = "percent.ribo")
getwd()
plot1 <- FeatureScatter(hp_sce, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(hp_sce, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
CombinePlots(plots = list(plot1, plot2))
查看质控
rownames(hp_sce)[grepl('^mt-',rownames(hp_sce))]
rownames(hp_sce)[grepl('^Rp[sl]',rownames(hp_sce))]
hp_sce[["percent.mt"]] <- PercentageFeatureSet(hp_sce, pattern = "^mt-")
fivenum(hp_sce[["percent.mt"]][,1])
rb.genes <- rownames(hp_sce)[grep("^Rp[sl]",rownames(hp_sce))]
C<-GetAssayData(object = hp_sce, slot = "counts")
percent.ribo <- Matrix::colSums(C[rb.genes,])/Matrix::colSums(C)*100
hp_sce <- AddMetaData(hp_sce, percent.ribo, col.name = "percent.ribo")
getwd()
plot1 <- FeatureScatter(hp_sce, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(hp_sce, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
CombinePlots(plots = list(plot1, plot2))
VlnPlot(hp_sce, features = c("percent.ribo", "percent.mt"), ncol = 2)
VlnPlot(hp_sce, features = c("nFeature_RNA", "nCount_RNA"), ncol = 2)
VlnPlot(hp_sce, features = c("percent.ribo", "nCount_RNA"), ncol = 2)
hp_sce
#为了演示celltypeist,把物种改为human的gene symbol
hp_sce=RenameGenesSeurat(obj = hp_sce,
newnames = toupper(rownames(hp_sce)))
hp_sce1 <- subset(hp_sce, subset = nFeature_RNA > 200 & nCount_RNA > 1000 & percent.mt < 20)
hp_sce1
设置组别
sce=hp_sce1
sce
colnames(sce)
grep(colnames(sce),pattern = ".1")
grep(colnames(sce),pattern = ".2")
sce@meta.data$stim <-c(rep("PBS", length(grep("1$", sce@assays$RNA@counts@Dimnames[[2]]))),
rep("PBS", length(grep("2$", sce@assays$RNA@counts@Dimnames[[2]]))),
rep("PBS", length(grep("3$", sce@assays$RNA@counts@Dimnames[[2]]))),
rep("Bleomycin", length(grep("4$", sce@assays$RNA@counts@Dimnames[[2]]))),
rep("Bleomycin", length(grep("5$", sce@assays$RNA@counts@Dimnames[[2]]))),
rep("Bleomycin", length(grep("6$", sce@assays$RNA@counts@Dimnames[[2]])))
) ## 8186,7947;
table(sce$stim)
library(dplyr)
sce[["RNA"]]@meta.features <- data.frame(row.names = rownames(sce[["RNA"]]))
seurat标准流程 :
All = sce%>%Seurat::NormalizeData(verbose = FALSE) %>%
FindVariableFeatures(selection.method = "vst", nfeatures = 2000) %>%
ScaleData(verbose = FALSE)
All = RunPCA(All, npcs = 50, verbose = FALSE)
pdf("2_ElbowPlot.pdf")
ElbowPlot(All, ndims = 50)
dev.off()
使用harmony降维聚类分群
library(cowplot)
#All@meta.data$stim <- c(rep("case", length(grep("1$", All@assays$RNA@counts@Dimnames[[2]]))), rep("ctrl", length(grep("2$", All@assays$RNA@counts@Dimnames[[2]])))) ## 8186,7947;
pdf("2_pre_harmony_harmony_plot.pdf")
options(repr.plot.height = 5, repr.plot.width = 12)
p1 <- DimPlot(object = All, reduction = "pca", pt.size = .1, group.by = "stim")
p2 <- VlnPlot(object = All, features = "PC_1", group.by = "stim", pt.size = .1)
plot_grid(p1, p2)
dev.off()
##########################run harmony
library(harmony)
All <- All %>% RunHarmony("stim", plot_convergence = TRUE)
harmony_embeddings <- Embeddings(All, 'harmony')
pdf("2_after_harmony_harmony_plot.pdf")
options(repr.plot.height = 5, repr.plot.width = 12)
p3 <- DimPlot(object = All, reduction = "harmony", pt.size = .1, group.by = "stim")
p4 <- VlnPlot(object = All, features = "harmony_1", group.by = "stim", pt.size = .1)
plot_grid(p3, p4)
dev.off()
#############cluster
library(harmony)
All <- All %>%
RunUMAP(reduction = "harmony", dims = 1:30) %>%
RunTSNE(reduction = "harmony", dims = 1:30) %>%
FindNeighbors(reduction = "harmony", dims = 1:30)
All<-All%>% FindClusters(resolution = 3) %>% identity()
保存 重新加载
#save(All,file ="/home/data/t040413/ipf/GSE104154_scRNA-seq_fibrotic MC_bleomycin/All_for_clustering.rds" )
load("/home/data/t040413/ipf/GSE104154_scRNA-seq_fibrotic MC_bleomycin/All_for_clustering.rds")
计算marker基因
library(Seurat)
DimPlot(All,label = T,reduction = 'tsne')
DimPlot(All,label = T,reduction = 'umap',group.by = 'stim')
getwd()
Disease.markers <- FindAllMarkers(All, min.pct = 0.35, logfc.threshold = 0.35, only.pos = T)
openxlsx::write.xlsx(Disease.markers,file ="G:/silicosis/geo/GSE104154_scRNA-seq_fibrotic MC_bleomycin/markers_for_all.xlsx" )
“ 分群之后,如何给细胞命名呢,可以使用celltypist自动命名,省去人工烦恼。”
02
—
2.分群之后,如何给细胞命名呢,可以使用celltypist自动命名,省去人工烦恼
此部分的正文内容从这里开始。
#https://mp.weixin.qq.com/s/CdhAp7lO5nRJW4cnLWGwLQ
.libPaths(c("/home/data/t040413/R/x86_64-pc-linux-gnu-library/4.2","/home/data/t040413/R/yll/usr/local/lib/R/site-library", "/usr/local/lib/R/library"))
library(Seurat)
load("/home/data/t040413/ipf/GSE104154_scRNA-seq_fibrotic MC_bleomycin/All_for_clustering.rds")
toupper()
#为了演示celltypeist,把物种改为human的gene symbol
All.merge=All
DimPlot(All.merge,label = T)
#在r中创建如下环境========================================
#conda create -n celltypist python=3.8
#conda activate celltypist
## 两种方法安装 pip or conda
#pip install celltypist -i https://pypi.douban.com/simple/ --use-pep517
#mamba install -c bioconda -c conda-forge celltypist
#使用use_condaenv()函数仅在当前R会话中激活指定的Conda环境。如果您希望在多个会话中使用相同的Conda环境,需要在每个会话中都执行相应的use_condaenv()函数调用。
#如果您使用的是conda而不是mamba,请将上述的mamba命令替换为conda命令进行安装。
#use_condaenv("celltypist") #如果您的Conda环境在默认路径中,您可以省略路径参数,仅提供环境的名称
library(reticulate)
use_condaenv("/home/data/t040413/anaconda3/envs/celltypist")#celltypist应该是环境的名称,而不是环境文件夹的名称。如果您的Conda环境文件夹名为celltypist,则上述代码应该可以正确激活环境。
## 载入python模块
scanpy = import("scanpy")
celltypist = import("celltypist")
pandas <- import("pandas")
numpy = import("numpy")
#### 下载参考数据集
celltypist$models$download_models(force_update = T)
library(Seurat)
#library(SeuratData)
library(ggplot2)
library(patchwork)
library(dplyr)
library(stringr)
library(readr)
library(cowplot)
library(reticulate)
1#Step1. 加载上述下载好的参考数据集
# 首先确认用户存放.pkl文件的位置,默认在
celltypist$models$celltypist_path # [1] "/home/data/t040413/.celltypist"
# 加载所有的参考数据集
model_type = list.files("~/.celltypist/data/models/")
names(model_type) = str_split(string = model_type,pattern = "\\.", simplify = T)[,1]
model_type
model_type[grep(pattern = "Mouse",model_type)]
names(model_type)
1.1#根据需要选择,,,,,这里我只选2个模型
model_type=model_type[c("Human_Lung_Atlas","Human_IPF_Lung")]
model_type
1.2#这个model_list非常重要,用于最后不同模型的可视化
model_list = lapply(model_type, function(x){
celltypist$models$Model$load(model = x)})
model_list
2.#加载数据
#load("/home/data/t040413/ipf/gse132771_colgen_fibroblasts/step1_get_matrix_ipf/All.merge.rds")
ifnb.data=All.merge
seurat.data=All.merge
2.1 #Seurat对象转为celltypist所需要的对象:
adata = scanpy$AnnData(X = numpy$array(as.matrix(t(as.matrix(ifnb.data[['RNA']]@counts)))),
obs = pandas$DataFrame(ifnb.data@meta.data),
var = pandas$DataFrame(data.frame(gene = rownames(ifnb.data[['RNA']]@counts),
row.names = rownames(ifnb.data[['RNA']]@counts)))
)
adata_copy <- adata$copy() #in order to edit array or otherwise Error: ValueError: output array is read-only
scanpy$pp$normalize_total(adata_copy, target_sum = 1e4)
scanpy$pp$log1p(adata_copy)
adata=adata_copy
3.# 细胞亚群预测和可视化
# Add cluster information to adata
adata$obs$seurat_clusters <- ifnb.data$seurat_clusters
# Update the neighborhood graph using cluster information
#scanpy::tl.louvain(adata, key_added = "seurat_clusters")
3.1#根据上面对各个参考数据集的介绍,我这里先用Human_IPF_Lung参考数据集预测看看:
### 1. Human_IPF_Lung
predictions = celltypist$annotate(adata, model = model_list[["Human_IPF_Lung"]], majority_voting = T)
## 把这些信息加入到seurat对象中去
seurat.data = AddMetaData(seurat.data, predictions$predicted_labels$majority_voting, col.name ="Human_IPF_Lung")
### 2. Human_Lung_Atlas
predictions = celltypist$annotate(adata, model = model_list[["Human_Lung_Atlas"]], majority_voting = T)
## 把这些信息加入到seurat对象中去
seurat.data = AddMetaData(seurat.data, predictions$predicted_labels$majority_voting, col.name ="Human_Lung_Atlas")
head(ifnb.data@meta.data)
head(predictions$predicted_labels)
table(predictions$predicted_labels$majority_voting)
table(predictions$predicted_labels$predicted_labels)
3.2### 3. 可视化
p0 = DimPlot(seurat.data , reduction = "umap",label = T,label.box = T,group.by = "seurat_clusters")+ NoLegend();p0
p1 = DimPlot(seurat.data ,group.by = "Human_IPF_Lung", reduction = "umap", label = TRUE)
p2 = DimPlot(seurat.data ,group.by = "Human_Lung_Atlas", reduction = "umap", label = TRUE,repel = T)
p0 + p2
pdf("celltypist.pdf",height=10,width=15)
print(p0 + p2 )
dev.off()
head(seurat.data@meta.data)
4.# 写函数批量运行
celltypist_vis = function(adata,
seurat_Data,
model = Immune_All_Low.pkl,
title = "Immune_All_Low"){
### 4.开始预测
predictions = celltypist$annotate(adata, model = model, majority_voting = T)
# predictions$predicted_labels %>% head()
#model=model_type[1]
#### 5.把这些信息加入到seurat对象中去
print(paste("运行到这里了__1:",names(model)))
seurat_Data = AddMetaData(seurat_Data , predictions$predicted_labels)
# model=model_list[[1]]
# model$load(model = model)
#names(model)
print(paste("运行到这里了__2:",names(model), paste(names(model),"_celltype_labels")))
head(seurat_Data )
seurat_Data = SetIdent(seurat_Data , value = "predicted_labels") #"majority_voting"
p.umap = DimPlot(seurat_Data, reduction = "umap", label = TRUE, pt.size = 0.5,label.box = T,repel = T) +
NoLegend() + ggtitle(title)
return(p.umap)
}
### 批量预测
plot.list = list()
for (i in 1:length(model_list)) {
plot.list[[i]] = celltypist_vis(adata = adata,
seurat_Data = ifnb.data, ###输入自己的
model = model_list[[i]],
title = names(model_list[i]) )
print(paste0("done==plot==",names(model_list)))
}
p.ct = wrap_plots(plot.list,ncol = 4) + p0
p.ct
ggsave(p.ct, filename = "./annotation_celltypist---.jpeg",limitsize = FALSE,
width = 60,height = 20)
这样就可以依据上面的细胞类型来命名了
#细胞类型命名,这里只是举例子,不是实际的数据
All.merge=RenameIdents(All.merge,
"1"="SMC","2"="SMC",
"7"="Pericyte",
"13"="Myofibroblast","14"="Myofibroblast",
"4"="Myofibroblast","10"="Myofibroblast","0"="Myofibroblast",
"5"="Fibroblast",
"12"="AT2",
"15"="AT1",
"21"="Mesothelial",
"20"="Goblet",
"19"="Ciliated",
"16"="B Plasma",
"22"="Mast",
"9"="T/NK",
"17"="B cells",
"24"="Plasmacytoid DCs",
"23"="Macrophage",
"3"="Macrophage",
"11"="DCs","18"="DCs",
"8"="Monocyte",
"6"="Endothelial",
"25"="Endothelial"
)
“ 下面部分着重一些常规的画图。”
03
—
批量差异分析 画图 groupgo批量富集分析
此部分的正文内容从这里开始。
1.首先来画曼哈顿图
数据准备:
pbmc$celltype.group=pbmc$celltype.stim
pbmc$celltype=pbmc$seurat_clusters
Idents(pbmc)=pbmc$celltype.group
cellfordeg<-levels(pbmc$celltype)
data_forplot=data.frame()
for(i in 1:length(cellfordeg)){ #
CELLDEG <- FindMarkers(pbmc, ident.1 = paste0(cellfordeg[i],"_PT"), ident.2 = paste0(cellfordeg[i],"_sham"), verbose = FALSE)
CELLDEG$gene=rownames(CELLDEG)
CELLDEG$cluster=cellfordeg[i]
write.csv(CELLDEG,paste0(cellfordeg[i],"PT_VS_Sham.CSV"))
data_forplot=rbind(CELLDEG,data_forplot)
}
head(data_forplot)
.libPaths(c( "/home/data/t040413/R/x86_64-pc-linux-gnu-library/4.2","/home/data/t040413/R/yll/usr/local/lib/R/site-library", "/home/data/refdir/Rlib/", "/usr/local/lib/R/library"))
library(Seurat)
library(scRNAtoolVis)
library(colourpicker)
markers_for_all_3groups=data_forplot
head(markers_for_all_3groups)
getwd()
# plot
jjVolcano(diffData = markers_for_all_3groups,
legend.position = c(0.93, 0.99),
topGeneN=2,#top genes to be labeled in plot, default 5.
cluster.order=seq(0,23,1),
pSize=0.4,
tile.col = c("#EE3B3B", "#FF7F00", "#CD6600", "#8B2323", "#DEB887", "#76EEC6",
"#F0FFFF", "#008B8B", "#FFB90F", "#F5F5DC", "#1F1F1F", "#66CD00",
"#0000FF", "#97FFFF", "#528B8B", "#9400D3", "#EE1289", "#00BFFF",
"#00FF00", "#191970", "#FFFF00", "#4A708B", "#00FF7F", "#8B8B00",
"#FF1493", "#FFA500", "#8B4513"))
2.画各种基础的图 featureplot dimplot
数据处理
#request 2
.libPaths(c( "/home/data/t040413/R/x86_64-pc-linux-gnu-library/4.2","/home/data/t040413/R/yll/usr/local/lib/R/site-library", "/home/data/refdir/Rlib/", "/usr/local/lib/R/library"))
library(Seurat)
getwd()
dir.create("/home/data/t040413/ipf/GSE104154_scRNA-seq_fibrotic MC_bleomycin/enrichments")
setwd("/home/data/t040413/ipf/GSE104154_scRNA-seq_fibrotic MC_bleomycin/enrichments")
getwd()
library(dplyr)
load("~/silicosis/silicosis_cluster_merge.rds")
All.merge$Cell_subtype=All.merge$cell.type
table(Idents(All.merge))
subset_data=All.merge
##############################################dimp plot
if(TRUE){
pdf("2__all_celltype_tsne.pdf",width = 8,height = 6)
p=DimPlot(subset_data,label = T,group.by = "Cell_subtype",reduction = "tsne",raster=FALSE,repel = T)
print(p)
dev.off()
pdf("2__all_cellsubtype_tsne_split.pdf",width = 18,height = 6)
p=DimPlot(subset_data,label = T,group.by = "Cell_subtype",split.by = "stim",reduction = "tsne",raster=FALSE,repel = T)
print(p)
dev.off()
pdf("2_all_cellsubtype_umap.pdf",width = 8,height = 6)
p=DimPlot(subset_data,label = T,group.by = "Cell_subtype",raster=FALSE,repel = T)
print(p)
dev.off()
pdf("2_all_cellsubtype_umap_split.pdf",width = 18,height = 6)
p=DimPlot(subset_data,label = T,group.by = "Cell_subtype",split.by = "stim",raster=FALSE,repel = T,reduction = "umap")
print(p)
dev.off()
}
展示其中一张
2.2 heatmap
markers=FindAllMarkers(subset_data,only.pos = T,logfc.threshold = 1.1)
head(markers)
markers$gene=rownames(markers)
write.csv(markers,file = "markers.csv")
openxlsx::write.xlsx(markers,file = "markers.xlsx")
getwd()
markers %>%
group_by(cluster) %>%
top_n(n = 20, wt = avg_log2FC) -> top20
DoHeatmap(subset_data, features = top20$gene) #+ NoLegend()
pdf("top20_heatmap_celltypes_Legend__order_by_averageExpression.pdf")
p=DoHeatmap(subset_data, features = top20$gene) #+ # scale_color_discrete(name = "Identity", labels = rep("",13))
print(p)
dev.off()
head(markers_for_Three_cd8subset)
#--------------------------------------------------
markers%>%
group_by(cluster) %>%
top_n(n = 5, wt = -p_val_adj) -> top5
DoHeatmap(subset_data, features = top5$gene) #+ NoLegend()
pdf("top5_heatmap_celltypes_Legend__order_by_p_val_adj.pdf")
p=DoHeatmap(subset_data, features = top5$gene) #+ # scale_color_discrete(name = "Identity", labels = rep("",13))
print(p)
dev.off()
3.如何利用单细胞数据画下面的富集分析图
3.1首先进行批量差异分析
输入数据要求
#--------------------------------------=======================--------批量差异分析差异分析
DimPlot(subset_data,label = T)
table(Idents(subset_data))
head(subset_data@meta.data)
all_degs=data.frame()
gsub(pattern = "/",replacement = "_",x = "yofibroblast/vascular smooth muscle cell degs.csv")
for (eachgroup in levels(subset_data$Cell_subtype) ) {
# # eachgroup="nLung"
# subset_data2=subset_data
# Idents(subset_data2)=subset_data2$stim
# subset_data2=subset(subset_data2,idents=eachgroup)
#
# Idents(subset_data2)=subset_data2$Cell_subtype
# table(Idents(subset_data2))
# degs=FindMarkers(subset_data2,ident.1 = "SiO2_56",ident.2 = "NS_56")
subset_data2=subset(subset_data,idents = eachgroup)
Idents(subset_data2)=subset_data2$stim
degs=FindMarkers(subset_data2,ident.1 = "SiO2_56",ident.2 = "NS_56")
degs$gene=rownames(degs)
degs$group=eachgroup
#防止类似文件名称引起的报错 Myofibroblast/vascular smooth muscle cell degs.csv': No such file or directory
write.csv(degs,file = paste(gsub(pattern = "/","_",x = eachgroup),"degs.csv"))
print(paste0(eachgroup,"__done"))
all_degs=rbind(degs,all_degs)
}
head(all_degs)
dim(all_degs)
write.csv(all_degs,file = "./all_degs.csv")
3.2 使用上述的all_degs进行分组批量富集分析析
DimPlot(subset_data,label = T)
table(Idents(subset_data))
head(subset_data@meta.data)
all_degs=data.frame()
gsub(pattern = "/",replacement = "_",x = "yofibroblast/vascular smooth muscle cell degs.csv")
for (eachgroup in levels(subset_data$Cell_subtype) ) {
# # eachgroup="nLung"
# subset_data2=subset_data
# Idents(subset_data2)=subset_data2$stim
# subset_data2=subset(subset_data2,idents=eachgroup)
#
# Idents(subset_data2)=subset_data2$Cell_subtype
# table(Idents(subset_data2))
# degs=FindMarkers(subset_data2,ident.1 = "SiO2_56",ident.2 = "NS_56")
subset_data2=subset(subset_data,idents = eachgroup)
Idents(subset_data2)=subset_data2$stim
degs=FindMarkers(subset_data2,ident.1 = "SiO2_56",ident.2 = "NS_56")
degs$gene=rownames(degs)
degs$group=eachgroup
#防止类似文件名称引起的报错 Myofibroblast/vascular smooth muscle cell degs.csv': No such file or directory
write.csv(degs,file = paste(gsub(pattern = "/","_",x = eachgroup),"degs.csv"))
print(paste0(eachgroup,"__done"))
all_degs=rbind(degs,all_degs)
}
head(all_degs)
dim(all_degs)
write.csv(all_degs,file = "./all_degs.csv")
getwd()
#all_degs=read.csv()
##########################----------------------enrichment analysis==================================================
#https://mp.weixin.qq.com/s/WyT-7yKB9YKkZjjyraZdPg
df=all_degs
##筛选阈值确定:p<0.05,|log2FC|>1
p_val_adj = 0.05
avg_log2FC = 0.6
#根据阈值添加上下调分组标签:
df$direction <- case_when(
df$avg_log2FC > avg_log2FC & df$p_val_adj < p_val_adj ~ "up",
df$avg_log2FC < -avg_log2FC & df$p_val_adj < p_val_adj ~ "down",
TRUE ~ 'none'
)
head(df)
df=df[df$direction!="none",]
head(df)
dim(df)
df$mygroup=paste(df$group,df$direction,sep = "_")
head(df)
dim(df)
##########################----------------------enrichment analysis==================================================
#https://mp.weixin.qq.com/s/WyT-7yKB9YKkZjjyraZdPg
{
library(clusterProfiler)
library(org.Hs.eg.db) #人
library(org.Mm.eg.db) #鼠
library(ggplot2)
# degs_for_nlung_vs_tlung$gene=rownames(degs_for_nlung_vs_tlung)
sce.markers=df
head(sce.markers)
ids=bitr(sce.markers$gene,'SYMBOL','ENTREZID',"org.Mm.eg.db") #'org.Hs.eg.db'
head(ids)
head(sce.markers)
sce.markers=merge(sce.markers,ids,by.x='gene',by.y='SYMBOL')
head(sce.markers)
dim(sce.markers)
sce.markers=sce.markers[sce.markers$group!="none",]
dim(sce.markers)
head(sce.markers)
sce.markers$cluster=sce.markers$mygroup
dim(sce.markers)
gcSample=split(sce.markers$ENTREZID, sce.markers$cluster)
gcSample # entrez id , compareCluster
print("===========开始go============")
xx <- compareCluster(gcSample, fun="enrichGO",OrgDb="org.Mm.eg.db" , #'org.Hs.eg.db',
pvalueCutoff=0.05) #organism="hsa",
print("===========开始 kegg============")
gg<-clusterProfiler::compareCluster(gcSample,fun = "enrichKEGG",
keyType = 'kegg', #KEGG 富集
organism='mmu',#"rno",
pvalueCutoff = 0.05 #指定 p 值阈值(可指定 1 以输出全部
)
p=dotplot(xx)
p=p+ theme(axis.text.x = element_text(angle = 90,
vjust = 0.5, hjust=0.5))
p
ggsave('degs_compareCluster-GO_enrichment.pdf',plot = p,width = 17,height = 25,limitsize = F)
ggsave('degs_compareCluster-GO_enrichment.png',plot = p,width = 17,height = 25,limitsize = F)
xx
write.csv(xx,file = "compareCluster-GO_enrichment.csv")
p=dotplot(gg)
p4=p+ theme(axis.text.x = element_text(angle = 90,
vjust = 0.5, hjust=0.5))
p4
print(paste("保存位置",getwd(),sep = " : "))
ggsave('degs_compareCluster-KEGG_enrichment-2.pdf',plot = p4,width = 16,height = 25,limitsize = F)
ggsave('degs_compareCluster-KEGG_enrichment-2.png',plot = p4,width = 16,height = 25,limitsize = F)
gg
openxlsx::write.xlsx(gg,file = "compareCluster-KEGG_enrichment.xlsx")
if (F) { #大鼠
print("===========开始go============")
xx <-clusterProfiler::compareCluster(gcSample, fun="enrichGO",OrgDb="org.Rn.eg.db",
readable=TRUE,
ont = 'ALL', #GO Ontology,可选 BP、MF、CC,也可以指定 ALL 同时计算 3 者
pvalueCutoff=0.05) #organism="hsa", #'org.Hs.eg.db',
print("===========开始 kegg============")
gg<-clusterProfiler::compareCluster(gcSample,fun = "enrichKEGG",
keyType = 'kegg', #KEGG 富集
organism="rno",
pvalueCutoff = 0.05 #指定 p 值阈值(可指定 1 以输出全部
)
p=dotplot(xx)
p2=p+ theme(axis.text.x = element_text(angle = 90,
vjust = 0.5, hjust=0.5))
p2
ggsave('degs_compareCluster-GO_enrichment-2.pdf',plot = p2,width = 6,height = 20,limitsize = F)
xx
openxlsx::write.xlsx(xx,file = "compareCluster-GO_enrichment.xlsx")
# -----
p=dotplot(gg)
p4=p+ theme(axis.text.x = element_text(angle = 90,
vjust = 0.5, hjust=0.5))
p4
print(paste("保存位置",getwd(),sep = " : "))
ggsave('degs_compareCluster-KEGG_enrichment-2.pdf',plot = p4,width = 6,height = 12,limitsize = F)
gg
openxlsx::write.xlsx(gg,file = "compareCluster-KEGG_enrichment.xlsx")
}
}
https://mp.weixin.qq.com/s/0lSC52suQVxsh8n2qYlgDw