UCSF-DSCOLAB / data_processing_pipelines

A repository to store the existing pipelines to process the various CoLabs datasets
0 stars 1 forks source link

sc_seq pipeline diagnostic plots for nCount_ADT are unusable #72

Open dtm2451 opened 5 months ago

dtm2451 commented 5 months ago

Described fully within, and will be fixed by, #71.

Only affects CITEseq datasets.

In the meantime, users can use the script below to generate "_diagnostic_plots_pre__zoomed.pdf" versions of diagnostic plots.

# Script for remaking diagnostic plots of libraries for setting QC cutoffs
# Author: Dan Bunis
# Updated: 3/26/24

library(Seurat)
library(yaml)
library(stringr)

######### PARAMETERS: Adjust these ############

# Adjust to the value your 'config.json' input: 'project_dir'
PROJECT_DIR <- "/krummellab/data1/immunox/PROJ1"

# You should not need to change this one.
SOBJ_DIR <- paste0(PROJECT_DIR, "/data/single_cell_GEX/processed/"

# Adjust the 'pattern' here to match your desired target libraries. 
LIBRARIES <- list.files(path = SOBJ_DIR, pattern = "PROJ1-POOL-GC(\\d)+-SCG(\\d)+")

# Adjust toward the targets of your 'config.json' inputs: 'default_qc_cuts_dir' and 'default_qc_cuts_file'
DEFAULT_PARAM_CSV <- "/krummellab/data1/immunox/PROJ1/data_processing_pipelines/single_cell_RNAseq/example-inputs/default_qc_cuts.csv"

######### END: PARAMETERS ############

timestamped_message <- function(...) {
  cat("[ ", format(Sys.time()), " ] ", ..., "\n", sep = "")
}

## scatterhist code from pipeline code https://github.com/UCSF-DSCOLAB/data_processing_pipelines/blob/ef/add_vdj_v2/single_cell_RNAseq/bin/seurat_utils.R
# MODIFIED only to allow zooming in on nCount_AD
scatterhist <- function(x, y, sobj, params) {
    x_upper = as.numeric(params[paste0(x,".upper"),])
    x_lower = as.numeric(params[paste0(x,".lower"),])
    y_upper = as.numeric(params[paste0(y,".upper"),])
    y_lower = as.numeric(params[paste0(y,".lower"),])
    xy_data = FetchData(sobj, vars=c(x, y))
    filter_cells = xy_data[,1] <= x_upper & 
        xy_data[,1] >= x_lower & 
        xy_data[,2] <= y_upper & 
        xy_data[,2] >= y_lower
    filter_cell_percent = round( sum(filter_cells) / ncol(sobj) ,3) * 100
    p = ggplot(sobj@meta.data, aes(x=!!sym(x), y=!!sym(y))) +
        geom_point(size=0.1,alpha=0.1) +
        geom_hex(bins=100) +
        scale_fill_distiller(palette = "RdYlBu") +
        theme_classic() +
        geom_vline(xintercept = x_upper) +
        geom_vline(xintercept = x_lower) +
        geom_hline(yintercept = y_upper) +
        geom_hline(yintercept = y_lower) + 
        geom_rect(aes(xmin=x_lower, xmax=x_upper, ymin=y_lower, ymax=y_upper), color="red", alpha=0) +
        geom_text(data = data.frame(x=x_lower, y=y_upper,text=paste0(filter_cell_percent, "%")), aes(x=x,y=y,label=text), hjust=0, vjust=1, color="darkred", size=8) +
        # added + above, and the below
        if (x == "nCount_ADT") {
            xlim(NA, 25000)
        } else if (y == "nCount_ADT") {
            ylim(NA, 25000)
        }
    # end add

    return( ggExtra::ggMarginal(p, type = "histogram", bins=50) )
}
### Also copying additional many plot wrapper code...
make_plots <- function(sobj, params) {
    plot_list = list()
    plot_list[[length(plot_list)+1]] = scatterhist("percent.ribo","percent.mt",sobj,params)
    plot_list[[length(plot_list)+1]] = scatterhist("nCount_RNA","percent.mt",sobj,params)
    plot_list[[length(plot_list)+1]] = scatterhist("nFeature_RNA","percent.mt",sobj,params)
    plot_list[[length(plot_list)+1]] = scatterhist("nCount_RNA","percent.ribo",sobj,params)
    plot_list[[length(plot_list)+1]] = scatterhist("nFeature_RNA","percent.ribo",sobj,params)
    plot_list[[length(plot_list)+1]] = scatterhist("nCount_RNA","nFeature_RNA",sobj,params)
    ### Removing this conditional as it's always TRUE for us!
    # if (adt.present){
        plot_list[[length(plot_list)+1]] = scatterhist("nCount_ADT","percent.mt",sobj,params)
        plot_list[[length(plot_list)+1]] = scatterhist("nCount_ADT","percent.ribo",sobj,params)
        plot_list[[length(plot_list)+1]] = scatterhist("nCount_RNA","nCount_ADT",sobj,params)
    # }
    return(plot_list)
}

timestamped_message("Reading default cutoff parameters")
# This should point to our DEFAULT values, so needs to be a 'fresh' cutoffs.csv
params <- read.csv(
    DEFAULT_PARAM_CSV,
    header = TRUE, row.names = 1)
params[is.na(params)] <- Inf
print(params)

timestamped_message("Looping through libraries to make zoomed in diagnostic plots")

N_LIBS <- length(LIBRARIES)

for (i in 1:N_LIBS) {
timestamped_message("\tworking on ", LIBRARIES[i])
  raw <- readRDS(paste0(
    SOBJ_DIR, LIBRARIES[i], "/automated_processing/", LIBRARIES[i], "_raw.rds"
  ))
  plot_list <- make_plots(raw, params)
  pdf(
    paste0(
      SOBJ_DIR, LIBRARIES[i], "/automated_processing/", LIBRARIES[i], "_diagnostic_plots_pre__zoomed.pdf"),
    w = 25, h = 21
    )
  print(ggpubr::ggarrange(plotlist=plot_list, nrow = 3, ncol = 3))
  dev.off()
}

timestamped_message("Done")