HelenaLC / CATALYST

Cytometry dATa anALYsis Tools
66 stars 31 forks source link

Here's some code for filtering sce by number of events and updating metadata in existing sce. #295

Closed david-priest closed 2 years ago

david-priest commented 2 years ago
  1. Filter an SCE by number of events/cells
filterSCEvents <- function(sce, n_cells = NULL)
{
test = table(sce$sample_id)
test = test[test>n_cells]
test = as.data.frame(test)
sce = filterSCE(sce, sample_id %in% test$Var1)
return(sce)
}
  1. Update metadata in existing sce
#metadata file in working directory, needs to have sample_id, condition, other also can be put in
md_u <- read_excel("metadata_update.xlsx")
md_u
#https://stackoverflow.com/questions/35636315/replace-values-in-a-dataframe-based-on-lookup-table
df = as.data.frame(sce@colData$sample_id)
lookup = md_u
new <- df  # create a copy of df
## using lapply, loop over columns and match values to the look up table. store in "new".
new[] <- lapply(df, function(x) lookup$FIELD[match(x, lookup$sample_id)])
sce$FIELD = as.factor(new$`sce@colData$sample_id`)
  1. Extract data from updated sce. ei(sce) does not seem to extract updated fields. This function gets the cluster frequencies for when you want to plotAbundances with a customised ggplot.
sceget <- function(sce, k = "merging1", meta = c("FIELD1", "FIELD2"), meta2 = "sample_id + FIELD1 + FIELD2 ~ cluster_id")

{

# Make a data frame containing cluster abundances
ns <- table(cluster_id = cluster_ids(sce, k), sample_id = sample_ids(sce))
fq <- prop.table(ns, 2) * 100
df <- as.data.frame(fq)
m <- match(df$sample_id, sce$sample_id)

# Extract any other metadata into the df data frame (replace the variables in c() with your own choices).
for (i in meta) df[[i]] <- sce[[i]][m]

# dcast to get rows as sample IDs
df = dcast(df, meta2, value.var = "Freq")

return(df)
}
  1. Hacked together function for alternative display of Heatmap (doesn't work with "both"). This function also provides catalyst's choice for merging for when you need to catch selected clusters from som100 and bring them down to e.g. meta20. Channel names are plotted at 45° I could not work out how to rotate merging_IDs 45° I could also not work out how to remove the original perpendicular channel names.

    
    # Custom plotExprHeatmap
    plotExprHeatmap1 <- function (x, features = NULL, by = c("sample_id", "cluster_id", 
    "both"), k = "meta20", m = NULL, assay = "exprs", fun = c("median", 
    "mean", "sum"), scale = c("first", "last", "never"), q = 0.01, 
    row_anno = TRUE, row_names = FALSE, col_anno = TRUE, row_clust = TRUE, col_clust = TRUE, 
    row_dend = TRUE, col_dend = TRUE, bars = FALSE, perc = FALSE, 
    bin_anno = FALSE, hm_pal = rev(brewer.pal(11, "RdYlBu")), 
    k_pal = CATALYST:::.cluster_cols, m_pal = k_pal, distance = c("euclidean", 
        "maximum", "manhattan", "canberra", "binary", "minkowski"), 
    linkage = c("average", "ward.D", "single", "complete", "mcquitty", 
        "median", "centroid", "ward.D2")) 
    {
    args <- as.list(environment())
    # CATALYST:::.check_args_plotExprHeatmap(args)
    distance <- match.arg(distance)
    linkage <- match.arg(linkage)
    scale <- match.arg(scale)
    fun <- match.arg(fun)
    by <- match.arg(by)
    x <- x[unique(CATALYST:::.get_features(x, features)), ]
    if (by != "sample_id") {
        CATALYST:::.check_k(x, k)
        x$cluster_id <- cluster_ids(x, k)
    }
    if (by == "both") 
        by <- c("cluster_id", "sample_id")
    .do_agg <- function() {
        z <- CATALYST:::.agg(x, by, fun, assay)
        if (length(by) == 1) 
            return(z)
        set_rownames(do.call("rbind", z), levels(x$cluster_id))
    }
    .do_scale <- function() {
        if (scale == "first") {
            z <- assay(x, assay)
            z <- CATALYST:::.scale_exprs(z, 1, q)
            assay(x, assay, FALSE) <- z
            return(x)
        }
        else CATALYST:::.scale_exprs(z, 1, q)
    }
    z <- switch(scale, first = {
        x <- .do_scale()
        .do_agg()
    }, last = {
        z <- .do_agg()
        .do_scale()
    }, never = {
        .do_agg()
    })
    if (length(by) == 1) 
        z <- t(z)
    if (scale != "never" && !(assay == "counts" && fun == "sum")) {
        qs <- round(quantile(z, c(0.01, 0.99)) * 5)/5
        lgd_aes <- list(at = seq(qs[1], qs[2], 0.2))
    }
    else lgd_aes <- list()
    lgd_aes$title_gp <- gpar(fontsize = 10, fontface = "bold", 
        lineheight = 0.8)
    if (!isFALSE(row_anno)) {
        left_anno <- switch(by[1], sample_id = CATALYST:::.anno_factors(x, 
            levels(x$sample_id), row_anno, "row"), CATALYST:::.anno_clusters(x, 
            k, m, k_pal, m_pal))
        catalyst_merge <<- merging_ids(x, k, m, k_pal, m_pal) # This will edit a global variable in the workspace with catalyst's choice for merging.
    }
    else left_anno <- NULL
    if (!isFALSE(col_anno) && length(by) == 2) {
        top_anno <- CATALYST:::.anno_factors(x, levels(x$sample_id), col_anno, 
            "colum")
    }
    else top_anno <- NULL
    if (bars) {
        right_anno <- anno_counts(x[[by[1]]], perc)
    }
    else right_anno <- NULL
    if (bin_anno) {
        cell_fun <- function(j, i, x, y, ...) grid.text(gp = gpar(fontsize = 8), 
            sprintf("%.2f", z[i, j]), x, y)
    }
    else cell_fun <- NULL
    a <- ifelse(assay == "exprs", "expression", assay)
    f <- switch(fun, median = "med", fun)
    hm_title <- switch(scale, first = sprintf("%s %s\n%s", fun, 
        "scaled", a), last = sprintf("%s %s\n%s", "scaled", fun, 
        a), never = paste(fun, a, sep = "\n"))
    if (length(by) == 2) {
        col_title <- features
    }
    else if (length(features) == 1 && features %in% c("type", 
        "state")) {
        col_title <- paste0(features, "_markers")
    }
    else col_title <- ""
    
    cn <- colnames(z) # Get the colunm names (names of each channel)
    #cn <- strsplit(cn,"_")  # Split the channel names by underscore. Comment this out or change to another symbol if needed
    #cn <- sapply(cn,"[",2)  # Get the second part (marker names) # Comment out if needed.
    
    Heatmap(matrix = z, name = hm_title, 
        col = colorRamp2(seq(min(z), 
        max(z), l = n <- 100), 
        colorRampPalette(hm_pal)(n)), 
        column_title = col_title, column_title_side = ifelse(length(by) == 2, "top", "bottom"), 
        cell_fun = cell_fun, 
        cluster_rows = row_clust, 
        cluster_columns = col_clust, 
        show_row_dend = row_dend, 
        show_column_dend = col_dend, 
        clustering_distance_rows = distance, 
        clustering_method_rows = linkage, 
        clustering_distance_columns = distance, 
        clustering_method_columns = linkage, 
        show_row_names = row_names, # whether to show row names
        row_names_side = ifelse(by[1] == "cluster_id" || isFALSE(row_anno) && !row_dend || isFALSE(row_clust), "left", "right"), #which side for row names
        top_annotation = top_anno,  # Not sure what top annotation is
        left_annotation = left_anno, # Left annotation is the colours for clusters and merging (not row names)
        right_annotation = right_anno, # Right annotation is the bar plots and %
        rect_gp = gpar(col = "white"), # How to draw heatmap body colours
        heatmap_legend_param = lgd_aes, 
        row_names_gp = gpar(fontsize = 12), # Font size for row names (clusters)
        bottom_annotation = HeatmapAnnotation(text = anno_text(cn, rot = 45, offset = unit(1, "npc"), just = "right"), annotation_height = max_text_width(cn))) # Make bottom annotations 45 degrees.
    
    #return(cn)

}

david-priest commented 2 years ago

Thanks and I saw you already addressed filtering by number of cells.