UCSF-DSCOLAB / data_processing_pipelines

A repository to store the existing pipelines to process the various CoLabs datasets
0 stars 1 forks source link

De-pool piepline for fully demultiplexed scRNAseq data #37

Open dtm2451 opened 9 months ago

dtm2451 commented 9 months ago

Probably a new pipeline entirely, and one that could follow directly after the current single_cell_RNAseq processing pipeline. The primary goal will be separating outputs from the current pipeline per each individual sample so that we have the materials to fill sc_seq model records, in addition the to sc_seq_pool records that current outputs map to directly.

I'll put together a list of attributes here, as well as some pseudo-code to help describe some of the pathing.

dtm2451 commented 9 months ago

Minimal Elements:

dtm2451 commented 9 months ago

Expanded into code =)

### Not sure how this part will work exactly, so at least defining my own versions of variables to use!

# Calling the full pool-level Seurat object: seurat_obj

# A metadata within the Seurat object that I'm expecting holds sample names of each cell = seurat_obj$sc_seq / seurat_obj@meta.data column named 'sc_seq'
samples <- sort(unique(seurat_obj$sc_seq))

# Run properties
pool_name
pool_chemistry # From cellranger count portion or metadata_csv
genome_name # Is this something we have?

# File outputs
filtered_counts_h5_path # (ending .h5)
processed_normalized_counts_h5_path # (ending .h5)
processed_scaled_counts_h5_path # (ending .h5)
processed_metadata_tsv_path # (.tsv)
processed_umap_path # (.png?)
processed_robject_rdata_path # (.RData, unless we want to swap to .Rds???)
record_metadata_path # (.csv) This one is for a polyphemus data_frame linker.

library(DropletUtils)
library(stringr)

### Initialize data.frame for holding record_metadata
# Start from pool record csv that'll come from Issue #35, but edit bits we can consistently set here
pool_record_metadata <- read.csv(pool_metadata_path, row.names=FALSE)
# Change sc_seq to sc_seq_pool, and fill it in!
colnames(pool_record_metadata)[colnames(pool_record_metadata)=="sc_seq"] <- "sc_seq_pool"
pool_record_metadata$sc_seq_pool <- pool_record_metadata$tube_name
# Remove 'parent modality' because it'll all be under the GEX record??
pool_record_metadata$parent_modality <- NULL
# Just keeping column names for the holder
record_metadata <- pool_record_metadata[1,, drop = FALSE][-1,, drop = FALSE]

for (this_sample in samples) {
    obj <- seurat_obj[, seurat_obj$sc_seq==this_sample]

    # counts_h5s
    write_h5 <- function(path, data) {
        write10xCounts(
            path = path,
            # I believe we will need to work on how to encode ADT and other data types. The function does seem limited to writing from a single modality.
            x = data,
            gene.type = "Gene Expression",
            type = "HDF5",
            genome = genome_name,
            version = "3",
            chemistry = pool_chemistry,
            library.ids = pool_name
        )
    }
    write_h5(
        path = filtered_counts_h5_path,
        data = GetAssayData(seurat_obj, assay = "RNA", slot = "counts")
    )
    write_h5(
        path = processed_normalized_counts_h5_path,
        data = GetAssayData(seurat_obj, assay = "RNA", slot = "data")
    )
    write_h5(
        path = processed_scaled_counts_h5_path,
        data = GetAssayData(seurat_obj, assay = "RNA", slot = "scale.data")
    )

    # Cell Metadata
    meta <- cbind(cell_name=rownames(seurat_obj@meta.data), seurat_obj@meta.data)
    write.table(
        meta,
        processed_metadata_tsv_path,
        sep = "\t", row.names = FALSE, col.names = TRUE
    )

    # Add to Record Metadata
    # May wanna double-check per column, but probably a direct copy of the
    # pool record csv of Issue #35 just with:
    # 'tube_name' value switched to name for this record
    # 'cells_loaded' divided by number of samples in the pool
    # 'processed_cells_recovered' based on number here
    new_record_metadata <- pool_record_metadata
    new_record_metadata$tube_name <- this_sample
    new_record_metadata$cells_loaded <- pool_record_metadata$cells_loaded / length(samples)
    new_record_metadata$processed_cells_recovered <- ncol(obj)
    record_metadata <- rbind(record_metadata, new_record_metadata)

    # processed_umap: -- seems like could follow 'make_plots' path towards end of 'bin/process_with_seurat.R'
    #  or if there's an equivalent in 'bin/process_with_seurat_post_filter.R'... scanned only a bit!
    #

    # Seurat object output
    save(
        obj,
        file = processed_robject_rdata_path,
        compress = TRUE
    )
}

# Output Record Metadata
write.csv(
    record_metadata,
    record_metadata_path,
    row.names = FALSE, col.names = TRUE
)

Couple notes: