mph270 commented 1 month ago

Hi! I am using your functions to merge cellbender and cellranger h5 files to assess background removal. I have a snRNAseq experiment from 4x WT and 4x KO samples. Everything seems to work smoothly, except for some reason one of my animals (KO2) looks very different (many cells with very high % mitochondrial genes) in the QC_Plots_Mito function when looking at the merged seurat object. When I process this animal individually, the QC_Plots_Mito looks much more similar to the % mitochondrial genes of the other animals (lower %). I repeated cellbender with custom "expected-cells" and "total-droplets-included" based on the barcode ranked plot for each sample and got the same result. Could you help me figure out why the % mitochondrial genes are plotting differently for this animal depending on whether the data are merged first?

I do get an error message when running this plot: "Warning: Default search for "data" layer in "RNA" assay yielded no results; utilizing "counts" layer instead." but the plot is still generated.

N.B. in attached image file, the individual KO2 QC_Plots_Mito is on the right (plotted as SeuratProject).

Thank you so much for your help!

Done.

CellBender Functionality & Plotting using scCustomize for MERGED data (4x WT and 4x KO)

library(ggplot2) library(dplyr) library(magrittr) library(patchwork) library(viridis) library(Seurat) library(scCustomize) library(qs) library(future)

cell_bender_merged <- Read_CellBender_h5_Multi_File(data_dir = "/d2/studies/Molly/snRNAseq/cellbender_second_pass", custom_name = "_cellbender_filtered.h5", sample_names = c("WT1", "WT2", "WT3", "WT4", "KO1", "KO2", "KO3", "KO4"), merge = TRUE)

cell_ranger_merged <- Read10X_h5_GEO(data_dir = "/d2/studies/Molly/snRNAseq/cellranger_h5files", sample_names = c("WT1", "WT2", "WT3", "WT4", "KO1", "KO2", "KO3", "KO4"), shared_suffix = "_filtered_feature_bc_matrix.h5", merge = TRUE)

dual_seurat <- Create_CellBender_Merged_Seurat(raw_cell_bender_matrix = cell_bender_merged, raw_counts_matrix = cell_ranger_merged, raw_assay_name = "RAW")

Add a new column 'genotype' to the Seurat object's metadata (1 and 2 refer to the first two characters/letters of "WT" and "KO")

dual_seurat$genotype <- substr(dual_seurat$orig.ident, 1, 2)

Verify that the new column was added and contains the correct values

table(dual_seurat$genotype)

Reorder levels of seurat object so WT first KO second

Example for reordering orig.ident

dual_seurat$orig.ident <- factor(dual_seurat$orig.ident, levels = c("WT1", "WT2", "WT3", "WT4", "KO1", "KO2", "KO3", "KO4"))

Example for reordering genotype

dual_seurat$genotype <- factor(dual_seurat$genotype, levels = c("WT", "KO"))

dual_seurat <- Add_CellBender_Diff(seurat_object = dual_seurat, raw_assay_name = "RAW", cell_bender_assay_name = "RNA")

head(dual_seurat@meta.data, 5)

dual_seurat <- Add_Mito_Ribo(object = dual_seurat, species = "Mouse")

dual_seurat <- Add_Cell_Complexity(object = dual_seurat)

median_stats <- Median_Stats(seurat_object = dual_seurat, group_by_var = "orig.ident", median_var = c("nCount_RAW", "nFeature_RAW", "nCount_Diff", "nFeature_Diff")) median_stats #view results

write.csv(median_stats, file = "cellbender_median_stats.csv", row.names = FALSE)

feature_diff <- CellBender_Feature_Diff(seurat_object = dual_seurat, raw_assay = "RAW", cell_bender_assay = "RNA") feature_diff #view results write.csv(feature_diff, file = "cellbender_feature_diff.csv", row.names = TRUE)

p1 <- CellBender_Diff_Plot(feature_diff_df = feature_diff) p2 <- CellBender_Diff_Plot(feature_diff_df = feature_diff, pct_diff_threshold = 50) p3 <- CellBender_Diff_Plot(feature_diff_df = feature_diff, num_features = 500, pct_diff_threshold = NULL) p4 <- CellBender_Diff_Plot(feature_diff_df = feature_diff, num_labels = 10) p5 <- CellBender_Diff_Plot(feature_diff_df = feature_diff, label = F) p6 <- CellBender_Diff_Plot(feature_diff_df = feature_diff, custom_labels = "Rpl32")

combined_plot <- wrap_plots(p1, p2, p3, p4, p5, p6, ncol = 2)

p7 <- QC_Plots_Genes(seurat_object = dual_seurat, low_cutoff = 500, high_cutoff = 8000) + scale_x_discrete(limits = levels(dual_seurat$orig.ident)) p8 <- QC_Plots_UMIs(seurat_object = dual_seurat, low_cutoff = 1000, high_cutoff = 80000) + scale_x_discrete(limits = levels(dual_seurat$orig.ident)) p9 <- QC_Plots_Mito(seurat_object = dual_seurat, high_cutoff = 5) + scale_x_discrete(limits = levels(dual_seurat$orig.ident)) p10 <- QC_Plots_Complexity(seurat_object = dual_seurat, high_cutoff = 0.8) + scale_x_discrete(limits = levels(dual_seurat$orig.ident))

wrap_plots(p7, p8, p9, p10, ncol = 2)

combined_cellbender_QC_plots <- wrap_plots(p7, p8, p9, p10, ncol = 2)

p11 <- QC_Plots_Genes(seurat_object = dual_seurat, low_cutoff = 500, high_cutoff = 8000, plot_median = TRUE, pt.size = 0) + scale_x_discrete(limits = levels(dual_seurat$orig.ident)) p12 <- QC_Plots_UMIs(seurat_object = dual_seurat, low_cutoff = 1000, high_cutoff = 80000, plot_median = TRUE, pt.size = 0, y_axis_log = TRUE) + scale_x_discrete(limits = levels(dual_seurat$orig.ident)) p13 <- QC_Plots_Mito(seurat_object = dual_seurat, high_cutoff = 5, plot_median = TRUE, pt.size = 0) + scale_x_discrete(limits = levels(dual_seurat$orig.ident)) p14 <- QC_Plots_Complexity(seurat_object = dual_seurat, high_cutoff = 0.8, , plot_median = TRUE, pt.size = 0) + scale_x_discrete(limits = levels(dual_seurat$orig.ident))

CellBender Functionality & Plotting using scCustomize for INDIVIDUAL data (KO2)

cellbender_KO2 <- Read_CellBender_h5_Mat(file_name = "/d2/studies/Molly/snRNAseq/cellbender_second_pass/KO2_cellbender_filtered.h5")

cellranger_KO2 <- Read10X_h5("/d2/studies/Molly/snRNAseq/cellranger_h5files/KO2_filtered_feature_bc_matrix.h5")

dual_KO2 <- Create_CellBender_Merged_Seurat(raw_cell_bender_matrix = cellbender_KO2, raw_counts_matrix = cellranger_KO2, raw_assay_name = "RAW")

dual_KO2 <- Add_CellBender_Diff(seurat_object = dual_KO2, raw_assay_name = "RAW", cell_bender_assay_name = "RNA")

head(dual_KO2@meta.data, 5)

dual_KO2 <- Add_Mito_Ribo(object = dual_KO2, species = "Mouse")

dual_KO2 <- Add_Cell_Complexity(object = dual_KO2)

median_stats <- Median_Stats(seurat_object = dual_KO2, median_var = c("nCount_RAW", "nFeature_RAW", "nCount_Diff", "nFeature_Diff")) median_stats #view results

setwd("/d2/studies/Molly/snRNAseq/output/cellbender2")

write.csv(median_stats, file = "cellbender_KO2_median_stats.csv", row.names = FALSE)

p16 <- QC_Plots_Genes(seurat_object = dual_KO2, low_cutoff = 500, high_cutoff = 8000) p17 <- QC_Plots_UMIs(seurat_object = dual_KO2, low_cutoff = 1000, high_cutoff = 80000) p18 <- QC_Plots_Mito(seurat_object = dual_KO2, high_cutoff = 5) p19 <- QC_Plots_Complexity(seurat_object = dual_KO2, high_cutoff = 0.8)

wrap_plots(p16, p17, p18, p19, ncol = 4)

wrap_plots(p9, p18) #this is image attached below

Rplot

sessionInfo() R version 4.4.1 (2024-06-14) Platform: x86_64-pc-linux-gnu Running under: Ubuntu 22.04.5 LTS

Matrix products: default BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0

locale: [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C

time zone: US/Eastern tzcode source: system (glibc)

attached base packages: [1] stats graphics grDevices utils datasets methods base

other attached packages: [1] qs_0.27.2 scCustomize_2.1.2 Seurat_5.1.0 SeuratObject_5.0.2 sp_2.1-4
[6] viridis_0.6.5 viridisLite_0.4.2 patchwork_1.3.0 magrittr_2.0.3 dplyr_1.1.4
[11] ggplot2_3.5.1

loaded via a [1] RColorBrewer_1.1-3 [5] spatstat.utils_3.1-0 [9] GlobalOptions_0.1.2 [13] paletteer_1.6.0 [17] sctransform_0.4.1 [21] ica_1.0-3 [25] lubridate_1.9.3 [29] pkgconfig_2.0.3 [33] fitdistrplus_1.2-1 [37] digest_0.6.37 [41] prismatic_1.1.2 [45] labeling_0.4.3 [49] timechange_0.3.0 [53] compiler_4.4.1 [57] MASS_7.3-61 [61] beeswarm_0.4.0 [65] glue_1.8.0 [69] Rtsne_0.17 [73] hdf5r_1.3.11 [77] RApiSerialize_0.1.4 [81] spatstat.geom_3.3-3 [85] pillar_1.9.0 [89] ggprism_1.0.5 [93] lattice_0.22-5 [97] tidyselect_1.2.1 [101] svglite_2.1.3 [105] lazyeval_0.2.2 [109] RcppParallel_5.1.9 [113] reticulate_1.39.0 [117] spatstat.random_3.3-2 [121] parallel_4.4.1 [125] ggridges_0.5.6 [129] cowplot_1.1.3 namespace (and not attached): rstudioapi_0.16.0 jsonlite_1.8.9 shape_1.4.6.1
ggbeeswarm_0.7.2 farver_2.1.2 ragg_1.3.3
vctrs_0.6.5 ROCR_1.0-11 spatstat.explore_3.3-2 janitor_2.2.0 htmltools_0.5.8.1 forcats_1.0.0
parallelly_1.38.0 KernSmooth_2.23-24 htmlwidgets_1.6.4
plyr_1.8.9 plotly_4.10.4 zoo_1.8-12
igraph_2.0.3 mime_0.12 lifecycle_1.0.4
Matrix_1.6-5 R6_2.5.1 fastmap_1.2.0
future_1.34.0 shiny_1.9.1 snakecase_0.11.1
colorspace_2.1-1 rematch2_2.1.2 tensor_1.5
RSpectra_0.16-2 irlba_2.3.5.1 textshaping_0.4.0
progressr_0.14.0 fansi_1.0.6 spatstat.sparse_3.1-0 httr_1.4.7 polyclip_1.10-7 abind_1.4-8
bit64_4.5.2 withr_3.0.1 fastDummies_1.7.4
tools_4.4.1 vipor_0.4.7 lmtest_0.9-40
httpuv_1.6.15 future.apply_1.11.2 goftest_1.2-3
nlme_3.1-165 promises_1.3.0 grid_4.4.1
cluster_2.1.6 reshape2_1.4.4 generics_0.1.3
gtable_0.3.5 spatstat.data_3.1-2 tidyr_1.3.1
data.table_1.16.0 stringfish_0.16.0 utf8_1.2.4
RcppAnnoy_0.0.22 ggrepel_0.9.6 RANN_2.6.2
stringr_1.5.1 spam_2.10-0 RcppHNSW_0.6.0
later_1.3.2 circlize_0.4.16 splines_4.4.1
bit_4.5.0 survival_3.7-0 deldir_2.0-4
miniUI_0.1.1.1 pbapply_1.7-2 gridExtra_2.3
scattermore_1.2 matrixStats_1.4.1 stringi_1.8.4
codetools_0.2-19 tibble_3.2.1 cli_3.6.3
uwot_0.2.2 systemfonts_1.1.0 xtable_1.8-4
munsell_0.5.1 Rcpp_1.0.13 globals_0.16.3
png_0.1-8 ggrastr_1.0.2 spatstat.univar_3.0-1 dotCall64_1.1-1 listenv_0.9.1 scales_1.3.0
leiden_0.4.3.1 purrr_1.0.2 rlang_1.1.4

samuel-marsh commented 1 month ago

Hi @mph270,

Sorry for the delay getting back to you. So my current best guess if that the issue here is likely related to use of multiple datasets vs 1 dataset. By default Create_CellBender_Merged_Seurat sets min_cells = 5 and min_features = 200. While min_features will filter the same cells regardless but which genes are kept vs removed based on min_cells will be different.

Can you post the plots if you re-run your above code but set min_cells = 0 and min_features = 0?

Thanks, Sam

mph270 commented 1 month ago

Hi Sam,

Thanks so much for your reply. I'm attaching the image of the plot after running my code but setting min_cells and min_features to 0. It looks much the same as before, so not sure if that is the issue.

Thanks again, Molly Mito repeat

samuel-marsh commented 4 weeks ago

Hi Molly,

Are you able to share the raw files for these samples? If needed I can provide code to replace and randomize non-mito gene names to anonymize the data. If so please send me email at samuel.marsh@childrens.harvard.edu and I can share links to upload data.

Best, Sam

mph270 commented 3 weeks ago

Hi Sam,

Yes, I'd be happy to share the raw files. I sent an email to you from my work email. Thanks so much for your help!

Best, Molly

samuel-marsh commented 1 week ago

Hi Molly,

So I have figured out this issue (will work on other one next). The issue here relates to supplying sample_names in the Read_CellBender_h5_Multi_File function. Unless a value is provide to sample_list the function will read all files from the directory and will read the files in alphabetical order from the directory (just how list.files() operates in R and what the function uses under the hood to detect the files).

I will be updating the documentation for function manual to add a warning about this and apologize for confusion.

So when you read in the files it reads the KO files first but is naming them WT. That is why there is a mismatch when loading just the single KO2 file alone. The sample with very high mito in your plots is actually WT2.

So there are two things that can be done to solve this 1) when running the function you can make sure whatever values are supplied to sample_names are in the same order as files are listed in directory alphabetically. 2) In addition to using the sample_names parameter you can use the sample_list parameter. This parameter is typically used to read in some but not all of files in directory. But key is that when this parameter is supplied the files will be read in the order supplied. So then sample_names just needs to match sample_list and not how the files are ordered alphabetically`

Code here works:

# match sample_names alphabetically
cell_bender_merged <- Read_CellBender_h5_Multi_File(data_dir = "~/Downloads/GitHub_208_210/cellbender_h5files/", custom_name = "_cellbender_filtered.h5", sample_names = c( "KO1", "KO2", "KO3", "KO4", "WT1", "WT2", "WT3", "WT4"))

# add sample_list parameter too
cell_bender_merged <- Read_CellBender_h5_Multi_File(data_dir = "~/Downloads/GitHub_208_210/cellbender_h5files/", custom_name = "_cellbender_filtered.h5", sample_names = c("WT1", "WT2", "WT3", "WT4", "KO1", "KO2", "KO3", "KO4"), sample_list = c("WT1", "WT2", "WT3", "WT4", "KO1", "KO2", "KO3", "KO4"))

As additional helper note when adding sample level information like genotype to cell level meta data in Seurat object scCustomize has easy helper function (especially useful when adding multiple sample level columns). You just need to create a data.frame or tibble (or read in a csv or excel file as data.frame) that contains one column which matches "orig.ident" in seurat object (1 row per unique value) and remaining columns containing new info. Then you use Add_Sample_Meta to quickly add to object meta.data.

sample_meta <- data.frame("sample_id" = c("WT1", "WT2", "WT3", "WT4", "KO1", "KO2", "KO3", "KO4"),
                          "genotype" = c("WT", "WT", "WT", "WT", "KO", "KO", "KO", "KO"))

obj <- Add_Sample_Meta(seurat_object = obj, meta_data = sample_meta, join_by_seurat = "orig.ident", join_by_meta = "sample_id")

In your case above since you are just adding single column using substr is easy but just wanted to give fyi that is helpful when adding large amounts of sample level meta info :).

If you still run into any issues with creating merged objects let me know and happy to reopen the issue.

Best, Sam

mph270 commented 1 week ago

Hi Sam,

Thank you so much for taking the time to look into this! I really really appreciate it! I will update my code accordingly. My data actually make much more sense now—we were confused about the directionality of the change in expression of a gene as compared to previous sequencing experiments! It’s a relief that the finding holds up.

Best, Molly

On Nov 12, 2024, at 9:47 AM, Samuel Marsh @.***> wrote:

USE CAUTION: External Message.

Hi Molly,

So I have figured out this issue (will work on other one next). The issue here relates to supplying sample_names in the Read_CellBender_h5_Multi_File function. Unless a value is provide to sample_list the function will read all files from the directory and will read the files in alphabetical order from the directory (just how list.files() operates in R and what the function uses under the hood to detect the files).

I will be updating the documentation for function manual to add a warning about this and apologize for confusion.

So when you read in the files it reads the KO files first but is naming them WT. That is why there is a mismatch when loading just the single KO2 file alone. The sample with very high mito in your plots is actually WT2.

So there are two things that can be done to solve this 1) when running the function you can make sure whatever values are supplied to sample_names are in the same order as files are listed in directory alphabetically. 2) In addition to using the sample_names parameter you can use the sample_list parameter. This parameter is typically used to read in some but not all of files in directory. But key is that when this parameter is supplied the files will be read in the order supplied. So then sample_names just needs to match sample_list and not how the files are ordered alphabetically`

Code here works:

match sample_names alphabetically

cell_bender_merged <- Read_CellBender_h5_Multi_File(data_dir = "~/Downloads/GitHub_208_210/cellbender_h5files/", custom_name = "_cellbender_filtered.h5", sample_names = c( "KO1", "KO2", "KO3", "KO4", "WT1", "WT2", "WT3", "WT4"))

add sample_list parameter too

cell_bender_merged <- Read_CellBender_h5_Multi_File(data_dir = "~/Downloads/GitHub_208_210/cellbender_h5files/", custom_name = "_cellbender_filtered.h5", sample_names = c("WT1", "WT2", "WT3", "WT4", "KO1", "KO2", "KO3", "KO4"), sample_list = c("WT1", "WT2", "WT3", "WT4", "KO1", "KO2", "KO3", "KO4"))

As additional helper note when adding sample level information like genotype to cell level meta data in Seurat object scCustomize has easy helper function (especially useful when adding multiple sample level columns). You just need to create a data.frame or tibble (or read in a csv or excel file as data.frame) that contains one column which matches "orig.ident" in seurat object (1 row per unique value) and remaining columns containing new info. Then you use Add_Sample_Meta to quickly add to object meta.data.

sample_meta <- data.frame("sample_id" = c("WT1", "WT2", "WT3", "WT4", "KO1", "KO2", "KO3", "KO4"), "genotype" = c("WT", "WT", "WT", "WT", "KO", "KO", "KO", "KO"))

obj <- Add_Sample_Meta(seurat_object = obj, meta_data = sample_meta, join_by_seurat = "orig.ident", join_by_meta = "sample_id")

In your case above since you are just adding single column using substr is easy but just wanted to give fyi that is helpful when adding large amounts of sample level meta info :).

If you still run into any issues with creating merged objects let me know and happy to reopen the issue.

Best, Sam

— Reply to this email directly, view it on GitHubhttps://urldefense.proofpoint.com/v2/url?u=https-3A__github.com_samuel-2Dmarsh_scCustomize_issues_208-23issuecomment-2D2470733334&d=DwMFaQ&c=shNJtf5dKgNcPZ6Yh64b-ALLUrcfR-4CCQkZVKC8w3o&r=7sT75vjcFYWbvMUHdmqKmdUdxrGdX1G-vXydaP_SfC0&m=Eh2myA0fOo-PnmVMKFV7q7XVcgPHwlxuzPTy9ekvIaiGaYtgat3zn1-H5Ipf_P12&s=o4KzbL0jS5GgkzHoFTjFAJz2SoAG5ml8YunHfWj1fmE&e=, or unsubscribehttps://urldefense.proofpoint.com/v2/url?u=https-3A__github.com_notifications_unsubscribe-2Dauth_BMDK7KSQVSAE7JYUDNFHRCD2AIIHXAVCNFSM6AAAAABP5TV22KVHI2DSMVQWIX3LMV43OSLTON2WKQ3PNVWWK3TUHMZDINZQG4ZTGMZTGQ&d=DwMFaQ&c=shNJtf5dKgNcPZ6Yh64b-ALLUrcfR-4CCQkZVKC8w3o&r=7sT75vjcFYWbvMUHdmqKmdUdxrGdX1G-vXydaP_SfC0&m=Eh2myA0fOo-PnmVMKFV7q7XVcgPHwlxuzPTy9ekvIaiGaYtgat3zn1-H5Ipf_P12&s=9YZ_ivdLqHUGYd52RxbWgTY4_9kvbQ_q5j8axoy8e2U&e=. You are receiving this because you were mentioned.Message ID: @.***>

samuel-marsh commented 1 week ago

No problem!! Happy to help! I can imagine it was shock to see everything reversed lol.

Best, Sam

samuel-marsh / scCustomize

QC_Plots_Mito for merged dual cellbender-cellranger seurat object does not match that for individual cellbender-cellranger seurat object #208

CellBender Functionality & Plotting using scCustomize for MERGED data (4x WT and 4x KO)

Add a new column 'genotype' to the Seurat object's metadata (1 and 2 refer to the first two characters/letters of "WT" and "KO")

Verify that the new column was added and contains the correct values

Reorder levels of seurat object so WT first KO second

Example for reordering orig.ident

Example for reordering genotype

CellBender Functionality & Plotting using scCustomize for INDIVIDUAL data (KO2)

match sample_names alphabetically

add sample_list parameter too