Closed ixxmu closed 9 months ago
大家好,欢迎来的单细胞空间转录组联合分析.续上期:Spatial multi-omic map of human myocardial infarction.
本文代码:作者整合了空转的多个数据最终形成ps_integrated_slides,(使用最终获取的ps_integrated_slides文件,再结合细胞丰度文件,可以做 ILR transformation、findniches),那么这个文件是啥意思呢
作者在General differences in tissue organization部分,提到了pseudo-bulk profiles, ps_integrated_slides中的ps就是pseudo-bulk简写
Hierarchical clustering, with euclidean distances and Ward’s algorithm,was used to cluster the pseudo-bulk profiles of the spatial transcriptomics datasets (replicates where merged, n = 27). Genes with less than 100 counts in 85% of the sample size were excluded for this analysis. Log normalization (scale factor = 10,000) was performed.
To visualize the general molecular differences between our samples,log-normalized pseudo-bulk profiles of the spatial transcriptomics datasets were projected in an UMAP embedding.
这次整合的结果:1.所有空转的表达数据整合在一起,同时使用silhouette方法获取最优的聚类个数。2. 通过sumCountsAcrossCells函数获取各个细胞类型的pseudo-bulk counts。 (使用最终获取的ps_integrated_slides文件,再结合细胞丰度文件,可以做 ILR transformation、find cell-type niches)
整合的过程也是分为两步
第一步,获取integrated_slides rds文件 。方法:整合多个cellranger处理好的空转数据,即整合多个slides;使用harmony来去除批次效应;使用silhouette获得最优聚类
文件部分截图如下:
详细代码:附在本文最后
这一步中作者的整合思路代码如下:
1 FindVariableFeatures 找到每个slide的高变基因
2 gene_selection_plt 把所有的高变基因整合在一个list内
3.使用merge函数,merge所有的slide
4.harmony
slide_files <- list.files("~/silicosis/spatial/prcessed_visum_for_progeny/data/",recursive = TRUE,
all.files = TRUE,full.names = TRUE,pattern = "_")
def_assay="Spatial"
hvg_list <- map(integrated_data, function(x) {
DefaultAssay(x) <- def_assay
x <- FindVariableFeatures(x, selection.method = "vst",
nfeatures = 3000)
x@assays[[def_assay]]@var.features
}) %>% unlist()
hvg_list <- table(hvg_list) %>%
sort(decreasing = TRUE)
gene_selection_plt <- hvg_list %>% enframe() %>%
group_by(value) %>%
mutate(value = as.numeric(value)) %>%
summarize(ngenes = length(name)) %>%
ggplot(aes(x = value, y = ngenes)) +
geom_bar(stat = "identity")
gene_selection <- hvg_list[1:3000] %>% names()
integrated_data <- purrr::reduce(integrated_data,
merge,
merge.data = TRUE)
integrated_data <- integrated_data %>%
ScaleData(verbose = FALSE) %>%
RunPCA(features = gene_selection,
npcs = 30,
verbose = FALSE)
integrated_data <- RunHarmony(integrated_data,
group.by.vars = batch_covars,
plot_convergence = TRUE,
assay.use = def_assay,
max.iter.harmony = 20)
第二步,获取ps_integrated_slides和ps_integrated_slides_niches rds文件。这两个文件是通过第一步中的ps_integrated_slides_niches.rds最终获得的。 方法:使用sumCountsAcrossCells函数
文件如下:
详细代码:附在本文最后
第一步代码:
#####安装archr包##别处复制
.libPaths(c("/home/data/t040413/R/x86_64-pc-linux-gnu-library/4.2",
"/home/data/t040413/R/yll/usr/local/lib/R/site-library",
"/usr/local/lib/R/library",
"/home/data/refdir/Rlib/"))
.libPaths()
library(tidyverse)
library(Seurat)
library(optparse)
library(scater)
library(harmony)
library(dplyr)
library(optparse)
library(tidyverse)
library(Seurat)
library(harmony)
library(cluster)
library(clustree)
#https://github.com/saezlab/visium_heart/blob/5b30c7e497e06688a8448afd8d069d2fa70ebcd2/st_snRNAseq/02_snuc_integration_harmony/integrate_objects.R#L194
slide_files <- list.files("~/silicosis/spatial/prcessed_visum_for_progeny/data/",recursive = TRUE,
all.files = TRUE,full.names = TRUE,pattern = "_")
integrated_data <- map(slide_files, readRDS)
print("You managed to load everything")
print("Object size")
print(object.size(integrated_data))
# Calculate HVG per sample - Here we assume that batch and patient effects aren't as strong
# since cell-types and niches should be greater than the number of batches
Assays(integrated_data [[1]])
str(integrated_data [[1]])
def_assay="Spatial"
hvg_list <- map(integrated_data, function(x) {
DefaultAssay(x) <- def_assay
x <- FindVariableFeatures(x, selection.method = "vst",
nfeatures = 3000)
x@assays[[def_assay]]@var.features
}) %>% unlist()
hvg_list <- table(hvg_list) %>%
sort(decreasing = TRUE)
gene_selection_plt <- hvg_list %>% enframe() %>%
group_by(value) %>%
mutate(value = as.numeric(value)) %>%
summarize(ngenes = length(name)) %>%
ggplot(aes(x = value, y = ngenes)) +
geom_bar(stat = "identity")
gene_selection <- hvg_list[1:3000] %>% names()
#########
# Create merged object ---------------------------------
integrated_data <- purrr::reduce(integrated_data,
merge,
merge.data = TRUE)
print("You managed to merge everything")
print("Object size")
print(object.size(integrated_data))
# Default assay ---------------------------------------
#DefaultAssay(integrated_data) <- def_assay
# Process it before integration -----------------------
integrated_data <- integrated_data %>%
ScaleData(verbose = FALSE) %>%
RunPCA(features = gene_selection,
npcs = 30,
verbose = FALSE)
original_pca_plt <- DimPlot(object = integrated_data,
reduction = "pca",
pt.size = .1,
group.by = "orig.ident")
head(integrated_data@meta.data)
batch_covars="orig.ident"
# Integrate the data -----------------------
integrated_data <- RunHarmony(integrated_data,
group.by.vars = batch_covars,
plot_convergence = TRUE,
assay.use = def_assay,
max.iter.harmony = 20)
# Corrected dimensions -----------------------
corrected_pca_plt <- DimPlot(object = integrated_data,
reduction = "harmony",
pt.size = .1,
group.by = "orig.ident")
# Create the UMAP with new reduction -----------
integrated_data <- integrated_data %>%
RunUMAP(reduction = "harmony", dims = 1:30,
reduction.name = "umap_harmony") %>%
RunUMAP(reduction = "pca", dims = 1:30,
reduction.name = "umap_original")
integrated_data <- FindNeighbors(integrated_data,
reduction = "harmony",
dims = 1:30)
head(integrated_data@meta.data)
colnames(integrated_data@meta.data)=gsub(pattern ="_snn_res.",replacement = "_before",x = colnames(integrated_data@meta.data) )
DefaultAssay(integrated_data)
integrated_data@meta.data %>%head()
optimize=TRUE
################opt cluster--------------
if(optimize) {
# Clustering and optimization -------------------------
print("Optimizing clustering")
seq_res <- seq(0.5, 1.5, 0.1)
# seq_res=1
integrated_data <- FindClusters(integrated_data,
resolution = seq_res,
verbose = F)
clustree_plt <- clustree::clustree(integrated_data,
prefix = paste0(DefaultAssay(integrated_data), "_snn_res."))
# Optimize clustering ------------------------------------------------------
cell_dists <- dist(integrated_data@reductions$harmony@cell.embeddings,
method = "euclidean")
head(cell_dists)
cluster_info <- integrated_data@meta.data[,grepl(paste0(DefaultAssay(integrated_data),"_snn_res"),
colnames(integrated_data@meta.data))] %>%
dplyr::mutate_all(as.character) %>%
dplyr::mutate_all(as.numeric)
head(cluster_info)[,1:9]
# head(cell_dists)[,1:9]
si= silhouette(cluster_info[,1], cell_dists) %>%head()
si
silhouette_res <- apply(cluster_info, 2, function(x){
si <- silhouette(x, cell_dists)
if(!any(is.na(si))) {
mean(si[, 'sil_width'])
} else {
NA
}
})
head(silhouette_res)
integrated_data[["opt_clust_integrated"]] <- integrated_data[[names(which.max(silhouette_res))]]
Idents(integrated_data) = "opt_clust_integrated"
# Reduce meta-data -------------------------------------------------------------------------
spam_cols <- grepl(paste0(DefaultAssay(integrated_data), "_snn_res"),
colnames(integrated_data@meta.data)) |
grepl("seurat_clusters",colnames(integrated_data@meta.data))
integrated_data@meta.data <- integrated_data@meta.data[,!spam_cols]
} else {
print("Not Optimizing clustering")
seq_res <- seq(0.2, 1.6, 0.2)
integrated_data <- FindClusters(integrated_data,
resolution = seq_res,
verbose = F)
clustree_plt <- clustree(integrated_data,
prefix = paste0(DefaultAssay(integrated_data),
"_snn_res."))
integrated_data <- FindClusters(integrated_data,
resolution = default_resolution,
verbose = F)
integrated_data[["opt_clust_integrated"]] <- integrated_data[["seurat_clusters"]]
spam_cols <- grepl(paste0(DefaultAssay(integrated_data), "_snn_res"),
colnames(integrated_data@meta.data)) |
grepl("seurat_clusters",colnames(integrated_data@meta.data))
integrated_data@meta.data <- integrated_data@meta.data[,!spam_cols]
}
# Save object ------------------------------------------------------
saveRDS(integrated_data, file ="~/silicosis/spatial/integrated_slides/integrated_slides.rds" )
# integrated_data=readRDS("~/silicosis/spatial/integrated_slides/integrated_slides.rds")
# head(integrated_data@meta.data)
#
# ps_integrated_slides=readRDS("~/silicosis/spatial/integrated_slides/ps_integrated_slides.rds")
#
# Print QC file ------------------------------------------------------
umap_corrected_sample <- DimPlot(object = integrated_data,
reduction = "umap_harmony",
pt.size = .1,
group.by = "orig.ident")
umap_corrected_clustering <- DimPlot(object = integrated_data,
reduction = "umap_harmony",
pt.size = .1,
group.by = "opt_clust_integrated")
umap_sample <- DimPlot(object = integrated_data,
reduction = "umap_original",
pt.size = .1,
group.by = "orig.ident")
umap_clustering <- DimPlot(object = integrated_data,
reduction = "umap_original",
pt.size = .1,
group.by = "opt_clust_integrated")
getwd()
head(integrated_data@meta.data)
Reductions(integrated_data)
DimPlot(integrated_data,label = TRUE,reduction = "umap_harmony" )
pdf(file = "~/silicosis/spatial/integrated_slides/ump.pdf", height = 10, width = 12)
print(gene_selection_plt)
print(original_pca_plt)
print(corrected_pca_plt)
print(umap_sample)
print(umap_corrected_sample)
print(clustree_plt)
print(umap_clustering)
print(umap_corrected_clustering)
dev.off()
第二步代码:
#####安装archr包##别处复制
.libPaths(c("/home/data/t040413/R/x86_64-pc-linux-gnu-library/4.2",
"/home/data/t040413/R/yll/usr/local/lib/R/site-library",
"/usr/local/lib/R/library",
"/home/data/refdir/Rlib/"))
.libPaths()
library(optparse)
library(tidyverse)
library(Seurat)
library(harmony)
library(cluster)
library(clustree)
#https://github.com/saezlab/visium_heart/blob/5b30c7e497e06688a8448afd8d069d2fa70ebcd2/st_snRNAseq/jobs/run_spatial_pseudobulk.sh#L25
#https://github.com/saezlab/visium_heart/blob/5b30c7e497e06688a8448afd8d069d2fa70ebcd2/st_snRNAseq/06_atlas_comparison/get_pseudobulk_profiles.R#L4
#1 ps_integrated_slides_niches---------
{
# Evaluate parameters
slide_files <- "/home/data/t040413/silicosis/spatial/integrated_slides/integrated_slides.rds"
# Identify levels of pseudobulking
vars=c("orig.ident","opt_clust_integrated")
vars <- set_names( unlist(strsplit(vars, ",")) )
print(vars)
# Read data and create pseudobulk profiles at two levels ----------------------------------
get_sample_pseudo <- function(slide_file, vars, group_vars = "n") {
slide <- readRDS(slide_file)
if(group_vars == "n") {
# Creates pseudobulk profile for each var
bulk_p_data <- map(vars, function(x) {
sumCountsAcrossCells(x = GetAssayData(slide, assay ="Spatial", slot = "counts"),
ids = slide@meta.data[, x])
})
} else {
bulk_p_data <- list()
bulk_p_data[["gex"]] <- sumCountsAcrossCells(x = GetAssayData(slide, assay = def_assay, slot = "counts"),
ids = DataFrame(slide@meta.data[, vars]))
}
bulk_p_data[["annotations"]] <- slide@meta.data
return(bulk_p_data)
}
pseudobulk_profiles <- map(set_names(slide_files),
get_sample_pseudo,
vars = vars,
group_vars = "collapsed")
pseudobulk_profiles[[1]][[1]]
colData(pseudobulk_profiles[[1]][[1]])
saveRDS(pseudobulk_profiles,file = "~/silicosis/spatial/integrated_slides/ps_integrated_slides_niches.rds")
}
####2 ps_integrated_slides.rds-----
{
# Evaluate parameters
slide_files <- "/home/data/t040413/silicosis/spatial/integrated_slides/integrated_slides.rds"
# Identify levels of pseudobulking
vars=c("orig.ident")
vars <- set_names( unlist(strsplit(vars, ",")) )
print(vars)
# Read data and create pseudobulk profiles at two levels --
get_sample_pseudo <- function(slide_file, vars, group_vars = "n") {
slide <- readRDS(slide_file)
if(group_vars == "n") {
# Creates pseudobulk profile for each var
bulk_p_data <- map(vars, function(x) {
sumCountsAcrossCells(x = GetAssayData(slide, assay ="Spatial", slot = "counts"),
ids = slide@meta.data[, x])
})
} else {
bulk_p_data <- list()
bulk_p_data[["gex"]] <- sumCountsAcrossCells(x = GetAssayData(slide, assay = def_assay, slot = "counts"),
ids = DataFrame(slide@meta.data[, vars]))
}
bulk_p_data[["annotations"]] <- slide@meta.data
return(bulk_p_data)
}
pseudobulk_profiles <- map(set_names(slide_files),
get_sample_pseudo,
vars = vars,
group_vars = "n")
pseudobulk_profiles[[1]][[1]]
colData(pseudobulk_profiles[[1]][[1]])
head(pseudobulk_profiles[[1]][[2]])
#https://github.com/saezlab/visium_heart/blob/5b30c7e497e06688a8448afd8d069d2fa70ebcd2/st_snRNAseq/jobs/run_spatial_pseudobulk.sh#L25
saveRDS(pseudobulk_profiles,file = "~/silicosis/spatial/integrated_slides/ps_integrated_slides.rds")
}
https://mp.weixin.qq.com/s/ZAgpFExgKY7Xj5Ich5NWCg