satijalab / seurat

R toolkit for single cell genomics
http://www.satijalab.org/seurat
Other
2.26k stars 908 forks source link

Query re: Multiome integration and differential expression #6138

Closed jarchana09 closed 2 years ago

jarchana09 commented 2 years ago

Hi, I have 8 10X multiome (scRNA+ATAC) samples (2 groups: stimulated; unstimulated). The analysis is based on the Seurat and Signac multiome and integration vignettes. After cell annotation, I performed differential expression. The results from the differential expression analysis using FindMarkers on the peaks assay comparing expression between the 2 groups for each cell type shows statistical significance only for one of the cell types. In addition the pct values are low (~2%-50%) Please could you let me know whether I am doing something wrong while integrating the peaks?

Any insights would be immensely helpful. Please find sections of the code that I have used and session info included below.

Thank you so much. Regards, Archana

suppressPackageStartupMessages({
library(Seurat)
#library(SeuratWrappers)
library(Signac)
library(ggplot2)
library(ensembldb)
library(GenomeInfoDb)
library(EnsDb.Mmusculus.v79)
options(future.globals.maxSize = 80000 * 1024ˆ2)
options(ggrepel.max.overlaps = Inf)
})
set.seed(1234)

# After QC and filtering of samples, load samples
Sample_1L_filt <- readRDS("Sample_1L_filt.rds")
Sample_3L_filt <- readRDS("Sample_3L_filt.rds")
Sample_5L_filt <- readRDS("Sample_5L_filt.rds")
Sample_6L_filt <- readRDS("Sample_6L_filt.rds")
Sample_1W_filt <- readRDS("Sample_1W_filt.rds")
Sample_3W_filt <- readRDS("Sample_3W_filt.rds")
Sample_5W_filt <- readRDS("Sample_5W_filt.rds")
Sample_6W_filt <- readRDS("Sample_6W_filt.rds")

# Integrating stimulated and unstimulated samples together
## Merging samples
all_merged <- merge(x = Sample_1L_filt, y = c(Sample_3L_filt, Sample_5L_filt,
Sample_6L_filt, Sample_1W_filt, Sample_3W_filt, Sample_5W_filt, Sample_6W_filt),
add.cell.ids = c("Sample_1L", "Sample_3L", "Sample_5L", "Sample_6L",
"Sample_1W", "Sample_3W", "Sample_5W", "Sample_6W"))
all_merged

## Merging ATAC peaks separately
combined_peaks <- GenomicRanges::reduce(unlist(as(
c(Sample_1L_filt@assays$ATAC@ranges,
Sample_3L_filt@assays$ATAC@ranges,
Sample_5L_filt@assays$ATAC@ranges,
Sample_6L_filt@assays$ATAC@ranges,
Sample_1W_filt@assays$ATAC@ranges,
Sample_3W_filt@assays$ATAC@ranges,
Sample_5W_filt@assays$ATAC@ranges,
Sample_6W_filt@assays$ATAC@ranges),
"GRangesList")))

peakwidths <- width(combined_peaks)
combined_peaks <- combined_peaks[peakwidths < 10000 & peakwidths > 20]

#Quantify peaks in the merged dataset to create a matrix of peaks x cell using the FeatureMatrix function
merged_atac_counts <- FeatureMatrix(fragments =all_merged@assays$ATAC@fragments, features = combined_peaks, 
cells = colnames(all_merged))

#add the quantified matrices to ATAC assay slot of the merged Seurat object
all_merged[["ATAC"]] <- CreateChromatinAssay(
counts = merged_atac_counts, fragments = all_merged@assays$ATAC@fragments, 
annotation = all_merged@assays$ATAC@annotation, sep = c(":", "-"), genome = "mm10")

#redo peak calling using MAC2 for the combined Seurat object, with the group.by parameter set to the 
#orig.ident (sample ID) to identify peaks for each sample separately
combined_peaks_macs2 <- CallPeaks(all_merged, assay = "ATAC", group.by = "orig.ident")

# remove peaks on nonstandard chromosomes and in genomic blacklist regions
combined_peaks_macs2 <- keepStandardChromosomes(combined_peaks_macs2, pruning.mode = "coarse")
combined_peaks_macs2 <- subsetByOverlaps(x = combined_peaks_macs2, ranges = blacklist_mm10, invert = TRUE)
#blacklist_mm10 from Signac developers https://satijalab.org/signac/reference/blacklist_mm10.html

# quantify counts in each peak
macs2_counts_combined <- FeatureMatrix(fragments = all_merged@assays$ATAC@fragments, 
features = combined_peaks_macs2, cells = colnames(all_merged) )
all_merged[["peaks"]] <- CreateChromatinAssay(counts = macs2_counts_combined, 
fragments = all_merged@assays$ATAC@fragments, 
annotation = all_merged@assays$ATAC@annotation, genome = "mm10")

## Normalization: RNA <br>
all_merged_list <- Seurat::SplitObject(object = all_merged, split.by = "sample_id")
all_merged_list <- lapply(X = all_merged_list, FUN = SCTransform, assay = "RNA", vars.to.regress = "percent.mt", 
verbose = FALSE)

# Integration of all samples: RNA
features_all_merged <- Seurat::SelectIntegrationFeatures(object.list = all_merged_list, nfeatures = 3000)
all_merged_list <- Seurat::PrepSCTIntegration(object.list = all_merged_list, anchor.features = features_all_merged)
all_merged_list <- lapply(X = all_merged_list, FUN = RunPCA, features = features_all_merged)
set.seed(42)
all_merged_anchors <- Seurat::FindIntegrationAnchors(object.list = all_merged_list, normalization.method = "SCT", 
anchor.features = features_all_merged, dims = 1:30, reduction = "rpca", k.anchor = 5, verbose = FALSE)

all_integrated <- Seurat::IntegrateData(anchorset = all_merged_anchors, normalization.method = "SCT", dims = 1:30)
#Linear dimensional reduction of RNA assay data: PCA

all_integrated <- Seurat::RunPCA(object = all_integrated, verbose = FALSE)

## Normalization and integration of all samples: ATAC <br>
DefaultAssay(all_integrated) <- "peaks"
all_integrated <- FindTopFeatures(all_integrated, min.cutoff = 'q0', assay = "peaks")
all_integrated <- RunTFIDF(all_integrated, method = 1)
#Linear dimension reduction
all_integrated <- RunSVD(all_integrated)
ElbowPlot(all_integrated, ndims = 50, reduction = "lsi")
DepthCor(all_integrated, n = 50, reduction = "lsi", assay = "peaks")

## Integration
all_integration_anchors_atac <- FindIntegrationAnchors(object.list = SplitObject(all_integrated, "orig.ident"), 
anchor.features = rownames(all_integrated), reduction = "rlsi", dims = 2:30)
all_integrated_atac <- IntegrateEmbeddings(anchorset = all_integration_anchors_atac, 
reductions = all_integrated[["lsi"]], 
new.reduction.name = "integrated_lsi", 
dims.to.integrate = 1:30)
#Adding the integrated ATAC data to the integrated RNA data
all_integrated[["integrated_lsi_atac"]] <- all_integrated_atac[["integrated_lsi"]]

#====================================
# After annotation of cell types, performing differential expression analysis
#Differentially accessible peaks between groups using logistic regression (LR)
DefaultAssay(object = all_integrated_rnm) <- "peaks"

get_differential_peaks <- function(celltype){
Seurat::FindMarkers(object = all_integrated_rnm, 
ident.1 = paste0(celltype, "_Stimulated"), 
ident.2 = paste0(celltype, "_Unstimulated"), 
verbose = FALSE, assay = "peaks", min.pct = 0.05, test.use = "LR", 
latent.vars = "nCount_peaks", logfc.threshold = 0) %>% 
rownames_to_column(var = "peaks") %>% 
cbind(cell_type = celltype, .)}
differential_peaks_all_integrated_rnm <- map_dfr(tmp_celltype_list, get_differential_peaks)

#===========================================================
# sessionInfo()
#> R version 4.1.2 (2021-11-01)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: CentOS Linux 7 (Core)
#> Matrix products: default
#> BLAS: /share/pkg.7/r/4.1.2/install/lib64/R/lib/libRblas.so
#> LAPACK: /share/pkg.7/r/4.1.2/install/lib64/R/lib/libRlapack.so
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#> attached base packages:
#> [1] parallel stats4 stats graphics grDevices utils datasets
#> [8] methods base
#> other attached packages:
#> [1] chromVAR_1.16.0 ggseqlogo_0.1
#> [3] motifmatchr_1.16.0 TFBSTools_1.32.0
#> [5] JASPAR2020_0.99.10 BSgenome.Mmusculus.UCSC.mm10_1.4.3
#> [7] BSgenome_1.60.0 rtracklayer_1.52.1
#> [9] Biostrings_2.60.2 XVector_0.32.0
#> [11] metap_1.7 multtest_2.48.0
#> [13] EnsDb.Mmusculus.v79_2.99.0 ensembldb_2.18.4
#> [15] AnnotationFilter_1.16.0 GenomicFeatures_1.46.5
#> [17] AnnotationDbi_1.56.2 ggrepel_0.9.1
#> [19] Matrix_1.4-1 scales_1.1.1
#> [21] writexl_1.4.0 SingleCellExperiment_1.16.0
#> [23] SummarizedExperiment_1.22.0 Biobase_2.52.0
#> [25] GenomicRanges_1.44.0 GenomeInfoDb_1.30.1
#> [27] IRanges_2.26.0 S4Vectors_0.30.2
#> [29] BiocGenerics_0.38.0 MatrixGenerics_1.4.3
#> [31] matrixStats_0.61.0 scDblFinder_1.8.0
#> [33] DT_0.20 forcats_0.5.1
#> [35] stringr_1.4.0 purrr_0.3.4
#> [37] readr_2.1.2 tidyr_1.2.0
#> [39] tibble_3.1.6 tidyverse_1.3.1
#> [41] ggcharts_0.2.1 magrittr_2.0.2
#> [43] future_1.23.0 RColorBrewer_1.1-2
#> [45] clustree_0.4.4 ggraph_2.0.5
#> [47] dplyr_1.0.7 patchwork_1.1.1
#> [49] ggplot2_3.3.5 Signac_1.7.0
#> [51] sp_1.4-6 SeuratObject_4.1.0
#> [53] Seurat_4.1.1
#> loaded via a namespace (and not attached):
#> [1] rsvd_1.0.5 ica_1.0-2
#> [3] RcppRoll_0.3.0 Rsamtools_2.8.0
#> [5] lmtest_0.9-39 crayon_1.4.2
#> [7] spatstat.core_2.3-2 rbibutils_2.2.7
#> [9] MASS_7.3-55 nlme_3.1-155
#> [11] backports_1.4.1 qlcMatrix_0.9.7
#> [13] reprex_2.0.1 rlang_1.0.0
#> [15] ROCR_1.0-11 readxl_1.3.1
#> [17] irlba_2.3.5 limma_3.48.3
#> [19] scater_1.20.1 filelock_1.0.2
#> [21] xgboost_1.5.0.2 BiocParallel_1.26.2
#> [23] rjson_0.2.21 CNEr_1.30.0
#> [25] bit64_4.0.5 glue_1.6.1
#> [27] poweRlaw_0.70.6 sctransform_0.3.3
#> [29] vipor_0.4.5 spatstat.sparse_2.1-0
#> [31] spatstat.geom_2.3-1 haven_2.4.3
#> [33] tidyselect_1.1.1 fitdistrplus_1.1-6
#> [35] XML_3.99-0.8 zoo_1.8-10
#> [37] GenomicAlignments_1.28.0 xtable_1.8-6
#> [39] evaluate_0.14 Rdpack_2.1.3
#> [41] scuttle_1.2.1 cli_3.1.1
#> [43] zlibbioc_1.38.0 sn_2.0.1
#> [45] rstudioapi_0.13 miniUI_0.1.1.1
#> [47] rpart_4.1.16 fastmatch_1.1-3
#> [49] mathjaxr_1.4-0 shiny_1.7.1
#> [51] BiocSingular_1.8.1 xfun_0.29
#> [53] cluster_2.1.2 caTools_1.18.2
#> [55] tidygraph_1.2.0 KEGGREST_1.32.0
#> [57] listenv_0.8.0 TFMPvalue_0.0.8
#> [59] png_0.1-7 withr_2.4.3
#> [61] bitops_1.0-7 slam_0.1-50
#> [63] ggforce_0.3.3 plyr_1.8.6
#> [65] cellranger_1.1.0 sparsesvd_0.2
#> [67] pracma_2.3.8 dqrng_0.3.0
#> [69] pillar_1.7.0 cachem_1.0.6
#> [71] multcomp_1.4-19 fs_1.5.2
#> [73] DelayedMatrixStats_1.14.3 vctrs_0.3.8
#> [75] ellipsis_0.3.2 generics_0.1.2
#> [77] rgdal_1.5-28 tools_4.1.2
#> [79] beeswarm_0.4.0 munsell_0.5.0
#> [81] tweenr_1.0.2 DelayedArray_0.18.0
#> [83] fastmap_1.1.0 compiler_4.1.2
#> [85] abind_1.4-7 httpuv_1.6.5
#> [87] plotly_4.10.0 rgeos_0.5-9
#> [89] GenomeInfoDbData_1.2.6 gridExtra_2.3
#> [91] edgeR_3.34.1 lattice_0.20-45
#> [93] deldir_1.0-6 mutoss_0.1-12
#> [95] utf8_1.2.2 later_1.3.0
#> [97] BiocFileCache_2.0.0 jsonlite_1.7.3
#> [99] docopt_0.7.1 ScaledMatrix_1.0.0
#> [101] pbapply_1.5-0 sparseMatrixStats_1.4.2
#> [103] lazyeval_0.2.2 promises_1.2.0.1
#> [105] R.utils_2.11.0 goftest_1.2-3
#> [107] spatstat.utils_2.3-0 reticulate_1.24
#> [109] rmarkdown_2.11 sandwich_3.0-2
#> [111] cowplot_1.1.1 statmod_1.4.36
#> [113] Rtsne_0.15 uwot_0.1.11
#> [115] igraph_1.2.11 survival_3.2-13
#> [117] numDeriv_2020.2-1 yaml_2.2.2
#> [119] plotrix_3.8-2 htmltools_0.5.2
#> [121] memoise_2.0.1 BiocIO_1.2.0
#> [123] locfit_1.5-9.4 graphlayouts_0.8.0
#> [125] viridisLite_0.4.0 digest_0.6.29
#> [127] assertthat_0.2.1 mime_0.12
#> [129] rappdirs_0.3.3 RSQLite_2.2.9
#> [131] future.apply_1.8.1 data.table_1.14.2
#> [133] blob_1.2.2 R.oo_1.24.0
#> [135] splines_4.1.2 ProtGenerics_1.24.0
#> [137] RCurl_1.98-1.5 broom_0.7.12
#> [139] hms_1.1.1 modelr_0.1.8
#> [141] colorspace_2.0-3 mnormt_2.0.2
#> [143] ggbeeswarm_0.6.0 tmvnsim_1.0-2
#> [145] Rcpp_1.0.8 RANN_2.6.1
#> [147] mvtnorm_1.1-3 fansi_1.0.2
#> [149] tzdb_0.2.0 parallelly_1.30.0
#> [151] R6_2.5.1 grid_4.1.2
#> [153] ggridges_0.5.3 lifecycle_1.0.1
#> [155] TFisher_0.2.0 bluster_1.2.1
#> [157] curl_4.3.2 leiden_0.3.9
#> [159] RcppAnnoy_0.0.19 TH.data_1.1-0
#> [161] htmlwidgets_1.5.4 beachmat_2.8.1
#> [163] polyclip_1.10-0 biomaRt_2.48.3
#> [165] seqLogo_1.60.0 rvest_1.0.2
#> [167] mgcv_1.8-38 globals_0.14.0
#> [169] progressr_0.10.0 codetools_0.2-18
#> [171] lubridate_1.8.0 GO.db_3.13.0
#> [173] metapod_1.0.0 gtools_3.9.2
#> [175] prettyunits_1.1.1 dbplyr_2.1.1
#> [177] R.methodsS3_1.8.1 gtable_0.3.0
#> [179] DBI_1.1.2 tensor_1.5
#> [181] httr_1.4.2 highr_0.9
#> [183] KernSmooth_2.23-20 presto_1.0.0
#> [185] stringi_1.7.6 progress_1.2.2
#> [187] reshape2_1.4.4 farver_2.1.0
#> [189] annotate_1.70.0 viridis_0.6.2
#> [191] xml2_1.3.3 BiocNeighbors_1.10.0
#> [193] restfulr_0.0.13 scattermore_0.7
#> [195] scran_1.20.1 bit_4.0.4
#> [197] spatstat.data_2.1-2 pkgconfig_2.0.3
#> [199] DirichletMultinomial_1.36.0 knitr_1.37
#Reference source codes:
https://satijalab.org/seurat/articles/weighted_nearest_neighbor_analysis.html;
https://satijalab.org/signac/articles/pbmc_multiomic.html;
https://satijalab.org/signac/articles/mouse_brain_vignette.html;
https://github.com/quadbiolab/scMultiome_analysis_vignette/blob/main/Tutorial.pdf;
https://github.com/hbctraining/scRNA-seq_online/blob/master/lessons/09_merged_SC_marker_identification.md
jarchana09 commented 2 years ago

Answered in https://github.com/timoast/signac/discussions/1157#discussioncomment-3095424