satijalab / seurat

R toolkit for single cell genomics
http://www.satijalab.org/seurat
Other
2.19k stars 892 forks source link

ATAC-seq data ScaleData(): "invalid class "ChromatinAssay" object: features in 'scale.data' must be in the same order as in 'data'" #8834

Closed HaotongZhang-Pitt closed 2 months ago

HaotongZhang-Pitt commented 2 months ago

Hello, I have two layers of data: the ATAC-seq results and the HTO hashtags. I encountered the following error when scaling the data, even if I skipped the demultiplexing steps:

library(Signac)
library(Seurat)
library(EnsDb.Mmusculus.v79)
library(ggplot2)
library(patchwork)
library(Matrix)

setwd()

# DATA LOADING 
whitelist <- data.frame("ATAC" = read.table("ATAC_raw_data/whitelist_DOGMA_ATAC.txt", col.names = "ATAC"), "GEX" = read.table("ATAC_raw_data/whitelist_DOGMA_GEX.txt", col.names = "GEX"))
# HTO data loading 
hto.1 <- readMM("ATAC_raw_data/featurecounts-HTO1/featurecounts.mtx")
barcodes.hto.1 <- read.table("ATAC_raw_data/featurecounts-HTO1/featurecounts.barcodes.txt")
features.hto.1 <- read.table("ATAC_raw_data/featurecounts-HTO1/featurecounts.genes.txt")
colnames(hto.1) <- features.hto.1$V1; rownames(hto.1) <- paste0(barcodes.hto.1$V1, "-", 1); hto.1 <- t(hto.1); rm(barcodes.hto.1, features.hto.1)
# convert the barcodes to ATAC
colnames(hto.1) <- paste0(whitelist$ATAC[match(gsub("-1", "", colnames(hto.1)), whitelist$GEX)], "-", 1)

# ADT data loading 
adt.1 <- readMM("ATAC_raw_data/featurecounts-ADT1/featurecounts.mtx")
barcodes.adt.1 <- read.table("ATAC_raw_data/featurecounts-ADT1/featurecounts.barcodes.txt")
features.adt.1 <- read.table("ATAC_raw_data/featurecounts-ADT1/featurecounts.genes.txt")
colnames(adt.1) <- features.adt.1$V1; rownames(adt.1) <- paste0(barcodes.adt.1$V1, "-", 1); adt.1 <- t(adt.1); rm(barcodes.adt.1, features.adt.1)
# convert the barcodes back to ATAC 
colnames(adt.1) <- paste0(whitelist$ATAC[match(gsub("-1", "", colnames(adt.1)), whitelist$GEX)], "-", 1)

# ATAC data loading 
counts.1 <- Read10X_h5(filename = "ATAC_raw_data/ATAC1_results/outs/filtered_peak_bc_matrix.h5")
joint.1 <- intersect(colnames(counts.1), colnames(hto.1))
counts.1 <- counts.1[, joint.1]
metadata.1 <- read.csv(file = "ATAC_raw_data/ATAC1_results/outs/singlecell.csv",
                       header = TRUE, row.names = 1)

annotations <- GetGRangesFromEnsDb(ensdb = EnsDb.Mmusculus.v79)
# change to UCSC style since the data was mapped to mm10
seqlevels(annotations) <- paste0('chr', seqlevels(annotations))
genome(annotations) <- "mm10"

# SEURAT OBJECT CREATION BASED ON ATAC DATA 
chrom_assay.1 <- CreateChromatinAssay(counts = counts.1, sep = c(":", "-"), 
                                      fragments = "ATAC_raw_data/ATAC1_results/outs/fragments.tsv.gz", 
                                      min.cells = 10, min.features = 200)
bmmc.1 <- CreateSeuratObject(counts = chrom_assay.1, meta.data = metadata.1, assay = "peaks"); Annotation(bmmc.1) <- annotations
# QC matrix generation 
# compute nucleosome signal score per cell
bmmc.1 <- NucleosomeSignal(object = bmmc.1)
# compute TSS enrichment score per cell
bmmc.1 <- TSSEnrichment(object = bmmc.1, fast = FALSE)
# add blacklist ratio and fraction of reads in peaks
bmmc.1$pct_reads_in_peaks <- bmmc.1$peak_region_fragments / bmmc.1$passed_filters * 100
bmmc.1$blacklist_ratio <- bmmc.1$blacklist_region_fragments / bmmc.1$peak_region_fragments
DensityScatter(bmmc.1, x = 'nCount_peaks', y = 'TSS.enrichment', log_x = TRUE, quantiles = TRUE)
ggsave("Plots/Density_bmmc.1_QC.png")
bmmc.1$high.tss <- ifelse(bmmc.1$TSS.enrichment > 3, 'High', 'Low')
TSSPlot(bmmc.1, group.by = 'high.tss') + NoLegend() + ggtitle("BMMC.1")
ggsave("Plots/TSS_bmmc.1.png")
bmmc.1 <- subset(x = bmmc.1,  subset = nCount_peaks > 3000 & nCount_peaks < 30000 &
                   pct_reads_in_peaks > 15 & blacklist_ratio < 0.05 & nucleosome_signal < 4 & TSS.enrichment > 3)
bmmc.1
saveRDS(bmmc.1, "R_data/bmmc.1_QC.rds")

# Demultiplexing
# Add hashtag data 
# selected the coincident cells in HTO data  
joint.1 <- intersect(colnames(bmmc.1@assays$peaks), colnames(hto.1))
hto.1 <- hto.1[, joint.1]
bmmc.1[["HTO"]] <- CreateAssayObject(counts = hto.1)
bmmc.1 <- NormalizeData(bmmc.1, assay = "HTO", normalization.method = "CLR")
# Demultiplex cells based on HTO enrichment 
bmmc.1 <- HTODemux(bmmc.1, assay = "HTO", positive.quantile = 0.99)
# demultiplexing result visualization 
table(bmmc.1$HTO_classification.global)
## Doublet Negative  Singlet 
## 5375     1420     3191 
Idents(bmmc.1) <- "HTO_maxID"
RidgePlot(bmmc.1, assay = "HTO", features = rownames(bmmc.1[["HTO"]]), ncol = 4)

# Keep singlets only 
Idents(bmmc.1) <- "HTO_classification.global"
bmmc.1 <- subset(bmmc.1, idents = "Singlet", invert = FALSE)
table(bmmc.1$HTO_classification.global)
## Singlet 
## 3191
# Save the data 
saveRDS(bmmc.1, "R_data/bmmc.1_singlet.rds")

# visualization 
DefaultAssay(object = bmmc.1) <- "peaks"
bmmc.1 <- FindVariableFeatures(bmmc.1, selection.method = "mean.var.plot")
bmmc.1 <- ScaleData(bmmc.1, assay = "peaks", features = VariableFeatures(bmmc.1))
Centering and scaling data matrix
  |==========================================================================| 100%
Error in validObject(object = value) : 
  invalid class "ChromatinAssay" object: features in 'scale.data' must be in the same order as in 'data'

Has anyone encountered with the similar problem? Thank you very much!

HaotongZhang-Pitt commented 2 months ago

sessionInfo() R version 4.3.0 (2023-04-21) Platform: x86_64-pc-linux-gnu (64-bit) Running under: Red Hat Enterprise Linux

Matrix products: default BLAS: /usr/lib64/libblas.so.3.4.2 LAPACK: /usr/lib64/liblapack.so.3.4.2

locale: [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8
[4] LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C
[10] LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C

time zone: America/New_York tzcode source: system (glibc)

attached base packages: [1] stats4 stats graphics grDevices utils datasets methods base

other attached packages: [1] Matrix_1.6-5 patchwork_1.2.0 ggplot2_3.5.0
[4] EnsDb.Mmusculus.v79_2.99.0 ensembldb_2.24.0 AnnotationFilter_1.24.0
[7] GenomicFeatures_1.52.0 AnnotationDbi_1.62.0 Biobase_2.60.0
[10] GenomicRanges_1.52.0 GenomeInfoDb_1.36.0 IRanges_2.34.0
[13] S4Vectors_0.38.2 BiocGenerics_0.46.0 Seurat_5.0.3
[16] SeuratObject_5.0.1 sp_2.1-3 Signac_1.12.9007

loaded via a namespace (and not attached): [1] RcppAnnoy_0.0.22 splines_4.3.0 later_1.3.2
[4] BiocIO_1.10.0 bitops_1.0-7 filelock_1.0.2
[7] tibble_3.2.1 polyclip_1.10-6 XML_3.99-0.14
[10] fastDummies_1.7.3 lifecycle_1.0.4 globals_0.16.3
[13] lattice_0.21-8 MASS_7.3-59 magrittr_2.0.3
[16] plotly_4.10.4 yaml_2.3.8 httpuv_1.6.15
[19] sctransform_0.4.1 spam_2.10-0 spatstat.sparse_3.0-3
[22] reticulate_1.35.0 cowplot_1.1.3 pbapply_1.7-2
[25] DBI_1.1.3 RColorBrewer_1.1-3 abind_1.4-5
[28] zlibbioc_1.46.0 Rtsne_0.17 purrr_1.0.2
[31] RCurl_1.98-1.12 rappdirs_0.3.3 GenomeInfoDbData_1.2.10
[34] ggrepel_0.9.5 irlba_2.3.5.1 listenv_0.9.1
[37] spatstat.utils_3.0-4 goftest_1.2-3 RSpectra_0.16-1
[40] spatstat.random_3.2-3 fitdistrplus_1.1-11 parallelly_1.37.1
[43] DelayedArray_0.26.2 leiden_0.4.3.1 codetools_0.2-19
[46] RcppRoll_0.3.0 xml2_1.3.4 tidyselect_1.2.1
[49] matrixStats_1.2.0 BiocFileCache_2.8.0 spatstat.explore_3.2-7
[52] GenomicAlignments_1.36.0 jsonlite_1.8.8 progressr_0.14.0
[55] ggridges_0.5.6 survival_3.5-5 tools_4.3.0
[58] progress_1.2.2 ica_1.0-3 Rcpp_1.0.12
[61] glue_1.7.0 gridExtra_2.3 MatrixGenerics_1.12.0
[64] dplyr_1.1.4 withr_3.0.0 fastmap_1.1.1
[67] fansi_1.0.6 digest_0.6.35 R6_2.5.1
[70] mime_0.12 colorspace_2.1-0 scattermore_1.2
[73] tensor_1.5 spatstat.data_3.0-4 biomaRt_2.56.0
[76] RSQLite_2.3.1 utf8_1.2.4 tidyr_1.3.1
[79] generics_0.1.3 data.table_1.15.2 rtracklayer_1.60.0
[82] S4Arrays_1.0.6 prettyunits_1.1.1 httr_1.4.7
[85] htmlwidgets_1.6.4 uwot_0.1.16 pkgconfig_2.0.3
[88] gtable_0.3.4 blob_1.2.4 lmtest_0.9-40
[91] XVector_0.40.0 htmltools_0.5.8 dotCall64_1.1-1
[94] ProtGenerics_1.32.0 scales_1.3.0 png_0.1-8
[97] rstudioapi_0.14 reshape2_1.4.4 rjson_0.2.21
[100] nlme_3.1-162 curl_5.0.0 zoo_1.8-12
[103] cachem_1.0.8 stringr_1.5.1 KernSmooth_2.23-20
[106] parallel_4.3.0 miniUI_0.1.1.1 restfulr_0.0.15
[109] pillar_1.9.0 grid_4.3.0 vctrs_0.6.5
[112] RANN_2.6.1 promises_1.2.1 dbplyr_2.3.2
[115] xtable_1.8-4 cluster_2.1.4 cli_3.6.2
[118] compiler_4.3.0 Rsamtools_2.16.0 rlang_1.1.3
[121] crayon_1.5.2 future.apply_1.11.2 plyr_1.8.9
[124] stringi_1.8.3 viridisLite_0.4.2 deldir_2.0-4
[127] BiocParallel_1.34.0 munsell_0.5.0 Biostrings_2.68.0
[130] lazyeval_0.2.2 spatstat.geom_3.2-9 RcppHNSW_0.6.0
[133] hms_1.1.3 bit64_4.0.5 future_1.33.2
[136] KEGGREST_1.40.0 shiny_1.8.1 SummarizedExperiment_1.30.1 [139] ROCR_1.0-11 igraph_2.0.3 memoise_2.0.1
[142] fastmatch_1.1-4 bit_4.0.5

cdalgarno commented 2 months ago

The ScaleData() function is meant for the Seurat scRNA-seq workflow. I suggest running Signac's scATAC-seq normalization and dimensional reduction workflow from this vignette instead: https://stuartlab.org/signac/articles/pbmc_vignette.