Closed ixxmu closed 5 months ago
评审环节开始!
写在前面
一、复写内容
library(devtools)
install_github("immunogenomics/harmony")
library(Seurat)
library(magrittr)
library(harmony)
library(dplyr)
library(ggplot2)
library(monocle)
#Kidney data loading 并构建seurat object
file.link("kidney1/GSM4145204_kidney1_barcodes.tsv.gz", "kidney1/barcodes.tsv.gz")
file.link("kidney1/GSM4145204_kidney1_features.tsv.gz", "kidney1/features.tsv.gz")
file.link("kidney1/GSM4145204_kidney1_matrix.mtx.gz", "kidney1/matrix.mtx.gz")
K1.data <- Read10X(data.dir = "kidney1/")
K1 <- CreateSeuratObject(counts = K1.data, project = "kidney1", min.cells = 8, min.features = 200)
file.link("kidney2/GSM4145205_kidney2_barcodes.tsv.gz", "kidney2/barcodes.tsv.gz")
file.link("kidney2/GSM4145205_kidney2_features.tsv.gz", "kidney2/features.tsv.gz")
file.link("kidney2/GSM4145205_kidney2_matrix.mtx.gz", "kidney2/matrix.mtx.gz")
K2.data <- Read10X(data.dir = "kidney2/")
K2 <- CreateSeuratObject(counts = K2.data, project = "kidney2", min.cells = 8, min.features = 200)
file.link("kidney3/GSM4145206_kidney3_barcodes.tsv.gz", "kidney3/barcodes.tsv.gz")
file.link("kidney3/GSM4145206_kidney3_features.tsv.gz", "kidney3/features.tsv.gz")
file.link("kidney3/GSM4145206_kidney3_matrix.mtx.gz", "kidney3/matrix.mtx.gz")
K3.data <- Read10X(data.dir = "kidney3/")
K3 <- CreateSeuratObject(counts = K3.data, project = "kidney3", min.cells = 8, min.features = 200)
kid <- merge(x = K1, y = list(K2, K3)) #读取文件并用merge函数进行合并
# quality control
kid[["percent.mt"]] <- PercentageFeatureSet(kid, pattern = "^MT-") #提取有关线粒体的基因
VlnPlot(kid, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3) #由图可以看出分布还可以
plot1 <- FeatureScatter(kid, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(kid, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
CombinePlots(plots = list(plot1, plot2))
kid <- subset(kid, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 30) #筛选条件
kid <- NormalizeData(kid, normalization.method = "LogNormalize", scale.factor = 10000)
kid <- NormalizeData(kid) #标准化
kid <- FindVariableFeatures(kid, selection.method = "vst", nfeatures = 2000) #查找高变基因
top10 <- head(VariableFeatures(kid), 10)
plot1 <- VariableFeaturePlot(kid)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE, xnudge = 0, ynudge = 0)
CombinePlots(plots = list(plot1, plot2))
ggsave("my_plot.png", plot2, width = 10, height = 6, units = "in")##图片因窗口问题,无法显示,选择直接保存查看
# 计算细胞周期
s.genes <-cc.genes$s.genes
g2m.genes<-cc.genes$g2m.genes
kid <- CellCycleScoring(kid, s.features = s.genes, g2m.features = g2m.genes, set.ident = TRUE)
all.genes <- rownames(kid)
kid <- ScaleData(kid, vars.to.regress = c("S.Score", "G2M.Score"), features = all.genes)
##此步骤相当耗费时间,根据文献所得细胞周期必然对单细胞测序结果的干扰,此处仅呈现代码
kid <- ScaleData(kid, features = rownames(kid))
kid <- RunPCA(kid , features = c(s.genes, g2m.genes))
DimPlot(kid)
#Eliminate batch effects with harmony and cell classification
kid <- RunPCA(kid, pc.genes = kid@var.genes, npcs = 20, verbose = FALSE)
options(repr.plot.height = 2.5, repr.plot.width = 6)
kid <- kid %>%
RunHarmony("orig.ident", plot_convergence = TRUE) #等候时间较长,请溜达溜达吧
harmony_embeddings <- Embeddings(kid, 'harmony')
harmony_embeddings[1:5, 1:5]
kid <- kid %>%
RunUMAP(reduction = "harmony", dims = 1:20) %>%
FindNeighbors(reduction = "harmony", dims = 1:20) %>%
FindClusters(resolution = 0.25) %>%
identity()
new.cluster.ids <- c(0,1, 2, 3, 4, 5, 6, 7,8,9,10)
names(new.cluster.ids) <- levels(kid)
kid <- RenameIdents(kid, new.cluster.ids)
#Calculating differentially expressed genes (DEGs) and Save rds file
kid.markers <- FindAllMarkers(kid, only.pos = TRUE, min.pct = 0.25, thresh.use = 0.25)#寻找高变基因
write.table(kid.markers,sep="\t",file="/home/yuzhenyuan/Seurat/0.2_20.xls")
write.table(kid.markers,sep="\t",file="kid.markers.xls")
saveRDS(kid,file="kid.rds")
#Some visual figure generation
DimPlot(kid, reduction = "umap", group.by = "orig.ident", pt.size = .1, split.by = 'orig.ident')#按样本进行划分
DimPlot(kid, reduction = "umap", group.by = "Phase", pt.size = .1) #按照细胞周期进行划分
DimPlot(kid, reduction = "umap", label = TRUE, pt.size = .1)
#注意作者在用同样参数设置后分为10个clusters,其实无关紧要,都需要通过marker重新贴现。
kid <- RenameIdents(object = kid,
"0" = "Proximal convoluted tubule cells",
"1" = "Proximal tubule cells",
"2" = "Proximal straight tubule cells",
"3" = "NK-T cells",
"4" = "Monocytes",
"5" = "Glomerular parietal epithelial cells",
"6" = "Distal tubule cells",
"7" = "Collecting duct principal cells",
"8" = "B cells",
"9" = "Collecting duct intercalated cells"
)
cell 3 <- DimPlot(object = kid,
reduction = "umap",
label = TRUE,
label.size = 3,
repel = TRUE)
ggsave("cell 3.png", a, width = 10, height = 6, units = "in")
#热图
DoHeatmap(object = kid, features = c("SLC13A3","SLC34A1","GPX3","DCXR","SLC17A3","SLC22A8","SLC22A7","GNLY","NKG7","CD3D","CD3E","LYZ","CD14","KRT8","KRT18","CD24","VCAM1","UMOD","DEFB1","CLDN8","AQP2","CD79A","CD79B","ATP6V1G3","ATP6V0D2","TMEM213"), size = 3 )
# 绘制部分基因热图
##小提琴图
VlnPlot(kid, pt.size =0,
idents= c("Proximal tubule cells","Proximal straight tubule cells","NK-T cells"),
features = c("GPX3", "DCXR","SLC13A3","SLC34A1","SLC22A8","SLC22A7"))
VlnPlot(kid,
idents= c("B cells","Collecting duct intercalated cells"),
features = c("AQP2", "ATP6V1B1","ATP6V0D2","ATP6V1G3"))
##tSNE Plot
kid <-RunTSNE(kid, reduction = "harmony", dims = 1:20)
TSNEPlot(kid, label = TRUE, pt.size = 1)
TSNEPlot(kid, pt.size = 1, group.by = "orig.ident", split.by = 'orig.ident')
TSNEPlot(kid, pt.size = 1, group.by = "Phase")
#Select a subset of PT cells
PT <- subset(x = kid, idents= c("Proximal convoluted tubule cells","Proximal tubule cells","Proximal straight tubule cells"))
saveRDS(PT,file="PT.rds")
#3个特征文件转换成CellDataSet格式
#PT <- readRDS(file = "PT.rds")
data <- as(as.matrix(PT@assays$RNA@counts), 'sparseMatrix')
pd <- new('AnnotatedDataFrame', data = PT@meta.data)
fData <- data.frame(gene_short_name = row.names(data), row.names = row.names(data))
fd <- new('AnnotatedDataFrame', data = fData)
my_cds <- newCellDataSet(data,
phenoData = pd,
featureData = fd,
lowerDetectionLimit = 0.5,
expressionFamily = negbinomial.size())
#将需要进行monocle的对象进行归一化
my_cds <- estimateSizeFactors(my_cds)#计算每个单细胞样本的规范化因子(size factors)
my_cds <- estimateDispersions(my_cds)#估计每个基因的离散度(dispersion)
my_cds <- detectGenes(my_cds, min_expr = 0.1) ##过滤掉低质量的基因。
pData(my_cds)$UMI <- Matrix::colSums(exprs(my_cds))
disp_table <- dispersionTable(my_cds)
#head(disp_table)
table(disp_table$mean_expression>=0.1)
unsup_clustering_genes <- subset(disp_table, mean_expression >= 0.1)#细胞平均表达量大于0.1
my_cds <- setOrderingFilter(my_cds, unsup_clustering_genes$gene_id)
plot_ordering_genes(my_cds)
expressed_genes <- row.names(subset(fData(my_cds), num_cells_expressed >= 10)) #细胞表达个数大于10
my_cds_subset <- my_cds[expressed_genes, ]
#my_cds_subset
#head(pData(my_cds_subset))
my_cds_subset <- detectGenes(my_cds_subset, min_expr = 0.1)
fData(my_cds_subset)$use_for_ordering <- fData(my_cds_subset)$num_cells_expressed > 0.05 * ncol(my_cds_subset)
table(fData(my_cds_subset)$use_for_ordering)
plot_pc_variance_explained(my_cds_subset, return_all = FALSE)
01
二、测试文件及代码
如果你也想学习本文的代码和思路,我们给大家准备好了测试文件和代码。
链接:https://pan.baidu.com/s/1eBQcA4OMrJNsouWWhzK-uw?pwd=n6lj
02
三、合作方介绍
北京寻因生物科技有限公司(简称寻因生物)成立于2018年,秉承“用技术温暖生命,预见未来”的企业愿景,致力于全球单细胞市场标准普世化发展,推动单细胞技术在早筛、临床、药研等应用方向的深耕落地,为全球科学家、技术公司及药企工业等客户提供可信赖的高品质单细胞技术服务及实验室建设解决方案。
科研者之家(www.home-for-researchers)是目前国内最知名的科研工具箱平台,以“科研不停,神器不止”为理念,自2019年上线以来,多款工具火爆科研界,如AI写作助手、Figdraw在线绘图平台、生信零代码平台、AI润色、DLP一键改写降重、中文版Pubmed、课题思路助手等。截止目前,包括Nature,Advanced Materials等顶级期刊在内,科研者之家相关科研工具被论文引用已超2300次之多。
西柚云超算:
03
四、往期作品回顾
04
如何联系我们
评审环节开始!
写在前面
一、复写内容
library(devtools)
install_github("immunogenomics/harmony")
library(Seurat)
library(magrittr)
library(harmony)
library(dplyr)
library(ggplot2)
library(monocle)
#Kidney data loading 并构建seurat object
file.link("kidney1/GSM4145204_kidney1_barcodes.tsv.gz", "kidney1/barcodes.tsv.gz")
file.link("kidney1/GSM4145204_kidney1_features.tsv.gz", "kidney1/features.tsv.gz")
file.link("kidney1/GSM4145204_kidney1_matrix.mtx.gz", "kidney1/matrix.mtx.gz")
K1.data <- Read10X(data.dir = "kidney1/")
K1 <- CreateSeuratObject(counts = K1.data, project = "kidney1", min.cells = 8, min.features = 200)
file.link("kidney2/GSM4145205_kidney2_barcodes.tsv.gz", "kidney2/barcodes.tsv.gz")
file.link("kidney2/GSM4145205_kidney2_features.tsv.gz", "kidney2/features.tsv.gz")
file.link("kidney2/GSM4145205_kidney2_matrix.mtx.gz", "kidney2/matrix.mtx.gz")
K2.data <- Read10X(data.dir = "kidney2/")
K2 <- CreateSeuratObject(counts = K2.data, project = "kidney2", min.cells = 8, min.features = 200)
file.link("kidney3/GSM4145206_kidney3_barcodes.tsv.gz", "kidney3/barcodes.tsv.gz")
file.link("kidney3/GSM4145206_kidney3_features.tsv.gz", "kidney3/features.tsv.gz")
file.link("kidney3/GSM4145206_kidney3_matrix.mtx.gz", "kidney3/matrix.mtx.gz")
K3.data <- Read10X(data.dir = "kidney3/")
K3 <- CreateSeuratObject(counts = K3.data, project = "kidney3", min.cells = 8, min.features = 200)
kid <- merge(x = K1, y = list(K2, K3)) #读取文件并用merge函数进行合并
# quality control
kid[["percent.mt"]] <- PercentageFeatureSet(kid, pattern = "^MT-") #提取有关线粒体的基因
VlnPlot(kid, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3) #由图可以看出分布还可以
plot1 <- FeatureScatter(kid, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(kid, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
CombinePlots(plots = list(plot1, plot2))
kid <- subset(kid, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 30) #筛选条件
kid <- NormalizeData(kid, normalization.method = "LogNormalize", scale.factor = 10000)
kid <- NormalizeData(kid) #标准化
kid <- FindVariableFeatures(kid, selection.method = "vst", nfeatures = 2000) #查找高变基因
top10 <- head(VariableFeatures(kid), 10)
plot1 <- VariableFeaturePlot(kid)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE, xnudge = 0, ynudge = 0)
CombinePlots(plots = list(plot1, plot2))
ggsave("my_plot.png", plot2, width = 10, height = 6, units = "in")##图片因窗口问题,无法显示,选择直接保存查看
# 计算细胞周期
s.genes <-cc.genes$s.genes
g2m.genes<-cc.genes$g2m.genes
kid <- CellCycleScoring(kid, s.features = s.genes, g2m.features = g2m.genes, set.ident = TRUE)
all.genes <- rownames(kid)
kid <- ScaleData(kid, vars.to.regress = c("S.Score", "G2M.Score"), features = all.genes)
##此步骤相当耗费时间,根据文献所得细胞周期必然对单细胞测序结果的干扰,此处仅呈现代码
kid <- ScaleData(kid, features = rownames(kid))
kid <- RunPCA(kid , features = c(s.genes, g2m.genes))
DimPlot(kid)
#Eliminate batch effects with harmony and cell classification
kid <- RunPCA(kid, pc.genes = kid@var.genes, npcs = 20, verbose = FALSE)
options(repr.plot.height = 2.5, repr.plot.width = 6)
kid <- kid %>%
RunHarmony("orig.ident", plot_convergence = TRUE) #等候时间较长,请溜达溜达吧
harmony_embeddings <- Embeddings(kid, 'harmony')
harmony_embeddings[1:5, 1:5]
kid <- kid %>%
RunUMAP(reduction = "harmony", dims = 1:20) %>%
FindNeighbors(reduction = "harmony", dims = 1:20) %>%
FindClusters(resolution = 0.25) %>%
identity()
new.cluster.ids <- c(0,1, 2, 3, 4, 5, 6, 7,8,9,10)
names(new.cluster.ids) <- levels(kid)
kid <- RenameIdents(kid, new.cluster.ids)
#Calculating differentially expressed genes (DEGs) and Save rds file
kid.markers <- FindAllMarkers(kid, only.pos = TRUE, min.pct = 0.25, thresh.use = 0.25)#寻找高变基因
write.table(kid.markers,sep="\t",file="/home/yuzhenyuan/Seurat/0.2_20.xls")
write.table(kid.markers,sep="\t",file="kid.markers.xls")
saveRDS(kid,file="kid.rds")
#Some visual figure generation
DimPlot(kid, reduction = "umap", group.by = "orig.ident", pt.size = .1, split.by = 'orig.ident')#按样本进行划分
DimPlot(kid, reduction = "umap", group.by = "Phase", pt.size = .1) #按照细胞周期进行划分
DimPlot(kid, reduction = "umap", label = TRUE, pt.size = .1)
#注意作者在用同样参数设置后分为10个clusters,其实无关紧要,都需要通过marker重新贴现。
kid <- RenameIdents(object = kid,
"0" = "Proximal convoluted tubule cells",
"1" = "Proximal tubule cells",
"2" = "Proximal straight tubule cells",
"3" = "NK-T cells",
"4" = "Monocytes",
"5" = "Glomerular parietal epithelial cells",
"6" = "Distal tubule cells",
"7" = "Collecting duct principal cells",
"8" = "B cells",
"9" = "Collecting duct intercalated cells"
)
cell 3 <- DimPlot(object = kid,
reduction = "umap",
label = TRUE,
label.size = 3,
repel = TRUE)
ggsave("cell 3.png", a, width = 10, height = 6, units = "in")
#热图
DoHeatmap(object = kid, features = c("SLC13A3","SLC34A1","GPX3","DCXR","SLC17A3","SLC22A8","SLC22A7","GNLY","NKG7","CD3D","CD3E","LYZ","CD14","KRT8","KRT18","CD24","VCAM1","UMOD","DEFB1","CLDN8","AQP2","CD79A","CD79B","ATP6V1G3","ATP6V0D2","TMEM213"), size = 3 )
# 绘制部分基因热图
##小提琴图
VlnPlot(kid, pt.size =0,
idents= c("Proximal tubule cells","Proximal straight tubule cells","NK-T cells"),
features = c("GPX3", "DCXR","SLC13A3","SLC34A1","SLC22A8","SLC22A7"))
VlnPlot(kid,
idents= c("B cells","Collecting duct intercalated cells"),
features = c("AQP2", "ATP6V1B1","ATP6V0D2","ATP6V1G3"))
##tSNE Plot
kid <-RunTSNE(kid, reduction = "harmony", dims = 1:20)
TSNEPlot(kid, label = TRUE, pt.size = 1)
TSNEPlot(kid, pt.size = 1, group.by = "orig.ident", split.by = 'orig.ident')
TSNEPlot(kid, pt.size = 1, group.by = "Phase")
#Select a subset of PT cells
PT <- subset(x = kid, idents= c("Proximal convoluted tubule cells","Proximal tubule cells","Proximal straight tubule cells"))
saveRDS(PT,file="PT.rds")
#3个特征文件转换成CellDataSet格式
#PT <- readRDS(file = "PT.rds")
data <- as(as.matrix(PT@assays$RNA@counts), 'sparseMatrix')
pd <- new('AnnotatedDataFrame', data = PT@meta.data)
fData <- data.frame(gene_short_name = row.names(data), row.names = row.names(data))
fd <- new('AnnotatedDataFrame', data = fData)
my_cds <- newCellDataSet(data,
phenoData = pd,
featureData = fd,
lowerDetectionLimit = 0.5,
expressionFamily = negbinomial.size())
#将需要进行monocle的对象进行归一化
my_cds <- estimateSizeFactors(my_cds)#计算每个单细胞样本的规范化因子(size factors)
my_cds <- estimateDispersions(my_cds)#估计每个基因的离散度(dispersion)
my_cds <- detectGenes(my_cds, min_expr = 0.1) ##过滤掉低质量的基因。
pData(my_cds)$UMI <- Matrix::colSums(exprs(my_cds))
disp_table <- dispersionTable(my_cds)
#head(disp_table)
table(disp_table$mean_expression>=0.1)
unsup_clustering_genes <- subset(disp_table, mean_expression >= 0.1)#细胞平均表达量大于0.1
my_cds <- setOrderingFilter(my_cds, unsup_clustering_genes$gene_id)
plot_ordering_genes(my_cds)
expressed_genes <- row.names(subset(fData(my_cds), num_cells_expressed >= 10)) #细胞表达个数大于10
my_cds_subset <- my_cds[expressed_genes, ]
#my_cds_subset
#head(pData(my_cds_subset))
my_cds_subset <- detectGenes(my_cds_subset, min_expr = 0.1)
fData(my_cds_subset)$use_for_ordering <- fData(my_cds_subset)$num_cells_expressed > 0.05 * ncol(my_cds_subset)
table(fData(my_cds_subset)$use_for_ordering)
plot_pc_variance_explained(my_cds_subset, return_all = FALSE)
01
二、测试文件及代码
如果你也想学习本文的代码和思路,我们给大家准备好了测试文件和代码。
链接:https://pan.baidu.com/s/1eBQcA4OMrJNsouWWhzK-uw?pwd=n6lj
02
三、合作方介绍
北京寻因生物科技有限公司(简称寻因生物)成立于2018年,秉承“用技术温暖生命,预见未来”的企业愿景,致力于全球单细胞市场标准普世化发展,推动单细胞技术在早筛、临床、药研等应用方向的深耕落地,为全球科学家、技术公司及药企工业等客户提供可信赖的高品质单细胞技术服务及实验室建设解决方案。
科研者之家(www.home-for-researchers)是目前国内最知名的科研工具箱平台,以“科研不停,神器不止”为理念,自2019年上线以来,多款工具火爆科研界,如AI写作助手、Figdraw在线绘图平台、生信零代码平台、AI润色、DLP一键改写降重、中文版Pubmed、课题思路助手等。截止目前,包括Nature,Advanced Materials等顶级期刊在内,科研者之家相关科研工具被论文引用已超2300次之多。
西柚云超算:
03
四、往期作品回顾
04
如何联系我们
https://mp.weixin.qq.com/s/_ggmd5FpVszU5fDTdFlwTQ