Issue with DA in integrated data

kvshams commented 2 years ago

Hi @timoast

This is in continuation with #744. After following the merging vignette, DA is not capturing the biology and perhaps the LogFC using FindMarkers, all region showed in the same direction (>0). In the previous integration (#744) the DA regions were consistent with the previously known biology.

What would have went wrong?. Below is the complete script that I have used.

suppressMessages({
    library(Signac)
    library(Seurat)
    library(GenomeInfoDb)
    library(EnsDb.Mmusculus.v79)
    library(ggplot2)
    library(patchwork)
    set.seed(1234)
})

# read in peak sets
peaks.BM1 <- read.table(
  file = "/home/shams/SC_MOUSE_ATAC/cellranger_outs/bm1/outs/peaks.bed",
  col.names = c("chr", "start", "end"))
# read in peak sets
peaks.BM2 <- read.table(
  file = "/home/shams/SC_MOUSE_ATAC/cellranger_outs/bm2/outs/peaks.bed",
  col.names = c("chr", "start", "end"))
# convert to genomic ranges
gr.BM1 <- makeGRangesFromDataFrame(peaks.BM1)
gr.BM2 <- makeGRangesFromDataFrame(peaks.BM2)
# Create a unified set of peaks to quantify in each dataset
combined.peaks <- reduce(x = c(gr.BM1, gr.BM2))

# Filter out bad peaks based on length
peakwidths <- width(combined.peaks)
combined.peaks <- combined.peaks[peakwidths  < 10000 & peakwidths > 20]

# load metadata
md.BM1 <- read.table(
  file = "/home/shams/SC_MOUSE_ATAC/cellranger_outs/bm1/outs/singlecell.csv",
  stringsAsFactors = FALSE,
  sep = ",",
  header = TRUE,
  row.names = 1
)[-1, ] # remove the first row
# load metadata
md.BM2 <- read.table(
  file = "/home/shams/SC_MOUSE_ATAC/cellranger_outs/bm2/outs/singlecell.csv",
  stringsAsFactors = FALSE,
  sep = ",",
  header = TRUE,
  row.names = 1
)[-1, ] # remove the first row

# perform an initial filtering of low count cells
md.BM1 <- md.BM1[md.BM1$passed_filters > 500, ]
md.BM2 <- md.BM2[md.BM2$passed_filters > 500, ]

# create fragment objects
frags.BM1 <- CreateFragmentObject(
  path = "/home/shams/SC_MOUSE_ATAC/cellranger_outs/bm1/outs/fragments.tsv.gz",
  cells = rownames(md.BM1)
)

# create fragment objects
frags.BM2 <- CreateFragmentObject(
  path = "/home/shams/SC_MOUSE_ATAC/cellranger_outs/bm2/outs/fragments.tsv.gz",
  cells = rownames(md.BM2)
)

#Quantify peaks in each dataset
BM1.counts <- FeatureMatrix(
  fragments = frags.BM1,
  features = combined.peaks,
  cells = rownames(md.BM1)
)

BM2.counts <- FeatureMatrix(
  fragments = frags.BM2,
  features = combined.peaks,
  cells = rownames(md.BM2)
)

Create the objects
BM1_assay <- CreateChromatinAssay(BM1.counts, fragments = frags.BM1)
BM1 <- CreateSeuratObject(BM1_assay, assay = "ATAC")

BM2_assay <- CreateChromatinAssay(BM2.counts, fragments = frags.BM2)
BM2 <- CreateSeuratObject(BM2_assay, assay = "ATAC")
# add information to identify dataset of origin
BM1$Sample <- 'BM1'
BM2$Sample <- 'BM2'

BM1 <- AddMetaData(BM1, md.BM1)
BM2 <- AddMetaData(BM2, md.BM2)
# extract gene annotations from EnsDb
annotations <- GetGRangesFromEnsDb(ensdb = EnsDb.Mmusculus.v79)

# change to UCSC style since the data was mapped to mm10
seqlevelsStyle(annotations) <- 'UCSC'
genome(annotations) <- "mm10"

# merge all datasets, adding a cell ID to make sure cell names are unique
combined <- merge(
    x = BM1,
    y = BM2,
    add.cell.ids = c("BM1", "BM2"))
combined

# compute nucleosome signal score per cell
combined <- NucleosomeSignal(object = combined)

# compute TSS enrichment score per cell
combined <- TSSEnrichment(object = combined, fast = TRUE)
# add blacklist ratio and fraction of reads in peaks
combined$pct_reads_in_peaks <- combined$peak_region_fragments / combined$passed_filters * 100
combined$blacklist_ratio <- combined$blacklist_region_fragments / combined$peak_region_fragments

QC_combined_filtred <- subset(
  x = combined,
  subset = peak_region_fragments > 3000 &
    peak_region_fragments < 100000 &
    pct_reads_in_peaks > 40 &
    blacklist_ratio < 0.025 &
    nucleosome_signal < 4 &
    TSS.enrichment > 5
)
QC_combined_filtred

QC_combined_filtred <- RunTFIDF(QC_combined_filtred)
QC_combined_filtred <- FindTopFeatures(QC_combined_filtred, min.cutoff = 20)
QC_combined_filtred <- RunSVD(QC_combined_filtred)
QC_combined_filtred <- RunUMAP(QC_combined_filtred, dims = 2:50, reduction = 'lsi')

Refdata <- readRDS("/home/shams/SC_MOUSE_ATAC/GSE128423.bm_subset_singlet_filtred_annotated_v2.rds")
DefaultAssay(Refdata) <- "RNA"
Refdata <- FindVariableFeatures(
  object = Refdata,
  nfeatures = 5000)

DefaultAssay(QC_combined_filtred) <- "RNA"
transfer.anchors <- FindTransferAnchors(
  reference = Refdata,
  query = QC_combined_filtred,
  reduction = 'cca',
  dims = 1:40)

predicted.labels <- TransferData(
  anchorset = transfer.anchors,
  refdata = Refdata$Cell_Labels_v2,
  weight.reduction = QC_combined_filtred[['lsi']],
  dims = 2:30
)
QC_combined_filtred <- AddMetaData(object = QC_combined_filtred, metadata = predicted.labels)

QC_combined_filtred$Cluster_Annotation_v1 <- QC_combined_filtred$predicted.id
QC_combined_filtred$Cluster_Annotation_v1_forMotifEnrichment <- paste(QC_combined_filtred$Sample,
                                                             as.character(QC_combined_filtred$Cluster_Annotation_v1)
                                                                    ,sep=":")
Idents(QC_combined_filtred) <- QC_combined_filtred$Cluster_Annotation_v1_forMotifEnrichment

# change back to working with peaks instead of gene activities
DefaultAssay(QC_combined_filtred) <- 'ATAC'

da_peaks_ProB_Cell <- FindMarkers(
  object = QC_combined_filtred,
  ident.1 = "BM1:Pro-B",
  ident.2 = "BM2:Pro-B",
  min.pct = 0.05,
  test.use = 'LR',
  latent.vars = 'peak_region_fragments'
)

head(da_peaks_ProB_Cell)

hist(da_peaks_ProB_Cell$avg_log2FC)


R version 4.1.0 (2021-05-18)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 20.04.2 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0

locale:
 [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
 [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
 [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
[10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   

attached base packages:
[1] stats4    parallel  stats     graphics  grDevices utils     datasets 
[8] methods   base     

other attached packages:
 [1] future_1.21.0              patchwork_1.1.1           
 [3] ggplot2_3.3.5              EnsDb.Mmusculus.v79_2.99.0
 [5] ensembldb_2.16.3           AnnotationFilter_1.16.0   
 [7] GenomicFeatures_1.44.0     AnnotationDbi_1.54.1      
 [9] Biobase_2.52.0             GenomicRanges_1.44.0      
[11] GenomeInfoDb_1.28.1        IRanges_2.26.0            
[13] S4Vectors_0.30.0           BiocGenerics_0.38.0       
[15] SeuratObject_4.0.2         Seurat_4.0.3.9013         
[17] Signac_1.3.0              

loaded via a namespace (and not attached):
  [1] utf8_1.2.2                  reticulate_1.20            
  [3] tidyselect_1.1.1            RSQLite_2.2.7              
  [5] htmlwidgets_1.5.3           grid_4.1.0                 
  [7] docopt_0.7.1                BiocParallel_1.26.1        
  [9] Rtsne_0.15                  munsell_0.5.0              
 [11] codetools_0.2-18            ica_1.0-2                  
 [13] pbdZMQ_0.3-5                miniUI_0.1.1.1             
 [15] withr_2.4.2                 colorspace_2.0-2           
 [17] filelock_1.0.2              knitr_1.33                 
 [19] uuid_0.1-4                  rstudioapi_0.13            
 [21] ROCR_1.0-11                 tensor_1.5                 
 [23] listenv_0.8.0               labeling_0.4.2             
 [25] MatrixGenerics_1.4.0        slam_0.1-48                
 [27] repr_1.1.3                  GenomeInfoDbData_1.2.6     
 [29] polyclip_1.10-0             bit64_4.0.5                
 [31] farver_2.1.0                parallelly_1.27.0          
 [33] vctrs_0.3.8                 generics_0.1.0             
 [35] xfun_0.24                   biovizBase_1.40.0          
 [37] BiocFileCache_2.0.0         lsa_0.73.2                 
 [39] ggseqlogo_0.1               R6_2.5.0                   
 [41] bitops_1.0-7                spatstat.utils_2.2-0       
 [43] cachem_1.0.5                DelayedArray_0.18.0        
 [45] assertthat_0.2.1            promises_1.2.0.1           
 [47] BiocIO_1.2.0                scales_1.1.1               
 [49] nnet_7.3-16                 gtable_0.3.0               
 [51] Cairo_1.5-12.2              globals_0.14.0             
 [53] goftest_1.2-2               rlang_0.4.11               
 [55] RcppRoll_0.3.0              splines_4.1.0              
 [57] rtracklayer_1.52.0          lazyeval_0.2.2             
 [59] dichromat_2.0-0             checkmate_2.0.0            
 [61] spatstat.geom_2.2-2         yaml_2.2.1                 
 [63] reshape2_1.4.4              abind_1.4-5                
 [65] backports_1.2.1             httpuv_1.6.1               
 [67] Hmisc_4.5-0                 tools_4.1.0                
 [69] ellipsis_0.3.2              spatstat.core_2.3-0        
 [71] RColorBrewer_1.1-2          ggridges_0.5.3             
 [73] Rcpp_1.0.7                  plyr_1.8.6                 
 [75] base64enc_0.1-3             progress_1.2.2             
 [77] zlibbioc_1.38.0             purrr_0.3.4                
 [79] RCurl_1.98-1.3              prettyunits_1.1.1          
 [81] rpart_4.1-15                deldir_0.2-10              
 [83] pbapply_1.4-3               cowplot_1.1.1              
 [85] zoo_1.8-9                   SummarizedExperiment_1.22.0
 [87] ggrepel_0.9.1               cluster_2.1.2              
 [89] magrittr_2.0.1              RSpectra_0.16-0            
 [91] data.table_1.14.0           scattermore_0.7            
 [93] lmtest_0.9-38               RANN_2.6.1                 
 [95] SnowballC_0.7.0             ProtGenerics_1.24.0        
 [97] fitdistrplus_1.1-5          matrixStats_0.60.0         
 [99] hms_1.1.0                   mime_0.11                  
[101] evaluate_0.14               xtable_1.8-4               
[103] XML_3.99-0.6                jpeg_0.1-9                 
[105] sparsesvd_0.2               gridExtra_2.3              
[107] compiler_4.1.0              biomaRt_2.48.2             
[109] tibble_3.1.3                KernSmooth_2.23-20         
[111] crayon_1.4.1                htmltools_0.5.1.1          
[113] mgcv_1.8-36                 later_1.2.0                
[115] Formula_1.2-4               tidyr_1.1.3                
[117] DBI_1.1.1                   tweenr_1.0.2               
[119] dbplyr_2.1.1                MASS_7.3-54                
[121] rappdirs_0.3.3              Matrix_1.3-4               
[123] igraph_1.2.6                pkgconfig_2.0.3            
[125] GenomicAlignments_1.28.0    foreign_0.8-81             
[127] IRdisplay_1.0               plotly_4.9.4.1             
[129] spatstat.sparse_2.0-0       xml2_1.3.2                 
[131] XVector_0.32.0              VariantAnnotation_1.38.0   
[133] stringr_1.4.0               digest_0.6.27              
[135] sctransform_0.3.2           RcppAnnoy_0.0.19           
[137] spatstat.data_2.1-0         Biostrings_2.60.1          
[139] leiden_0.3.9                fastmatch_1.1-3            
[141] htmlTable_2.2.1             uwot_0.1.10                
[143] restfulr_0.0.13             curl_4.3.2                 
[145] shiny_1.6.0                 Rsamtools_2.8.0            
[147] rjson_0.2.20                lifecycle_1.0.0            
[149] nlme_3.1-152                jsonlite_1.7.2             
[151] BSgenome_1.60.0             viridisLite_0.4.0          
[153] fansi_0.5.0                 pillar_1.6.2               
[155] lattice_0.20-44             KEGGREST_1.32.0            
[157] fastmap_1.1.0               httr_1.4.2                 
[159] survival_3.2-11             glue_1.4.2                 
[161] qlcMatrix_0.9.7             png_0.1-7                  
[163] bit_4.0.4                   ggforce_0.3.3              
[165] stringi_1.7.3               blob_1.2.2                 
[167] latticeExtra_0.6-29         memoise_2.0.0              
[169] IRkernel_1.2                dplyr_1.0.7                
[171] irlba_2.3.3                 future.apply_1.7.0         
```

timoast commented 2 years ago

If you lower the logfc.threshold (eg, logfc.threshold=0.01), can you identify any peaks that change in the other direction (<0?)

kvshams commented 2 years ago

Yes, I can see that. logfc is highly skewed towards one direction >0 (Both the data has comparable sequencing depth)

timoast commented 2 years ago

Ok, I'm not sure if you're suggesting there's something wrong in the function or just a general question about why this could happen? It seems like this is not a bug in the FindMarkers() function, especially if you can identify peaks with negative logFC when lowering the threshold. Are there any clearly differentially accessible regions that are being missed by the test?

timoast commented 2 years ago

Any update here?

stuart-lab / signac

Issue with DA in integrated data #754