hannahsfuchs / OBDS_Course

0 stars 0 forks source link

scRNAseq with Bioconductor #3

Open hannahsfuchs opened 1 year ago

hannahsfuchs commented 1 year ago

title: "Template code for single-cell analysis using Bioconductor" author: "Kevin Rue-Albrecht" date: "05/10/2022" output: html_document

knitr::opts_chunk$set(echo = TRUE)

Exercise

Import scRNA-seq data and create a SingleCellExperiment object

Note: use the samples= argument of the DropletUtils::read10xCounts() function to give a memorable name to each sample. Check the difference without using the samples argument.

library(DropletUtils)
setwd("/project/obds/hfuchs/4_scRNAseq/3_singlecell_bioconductor/")
sce <- DropletUtils::read10xCounts(samples = ".", sample.names = "pbmc5k")
#alternative
sce <- DropletUtils::read10xCounts(samples = c("pbmc5k" =                                        "/project/obds/shared/resources/4_r_single_cell/singlecell_bioconductor/filtered_feature_bc_matrix/"))
sce

Answer:

Note: slots of SummarizedExperiment objects are typically accessed using functions of the same name, e.g. metadata().

metadata(sce)
rowData(sce)
colData(sce) #verified that each barcode has been assigned to sample

Answer: not much - just stored file path

Exercise

Quality control

is.mito <- grep("^MT-", rowData(sce)$Symbol,
                value = FALSE) # subset looks for ensmbl rather than gene name, so index more reliable (but useful for checking)
library(scuttle)
sce <- scuttle::addPerCellQC(sce, 
                             subsets = list(MT = is.mito))
rowData(sce)
colData(sce) #this is were MT metrics are specified (+ some overall QC metrics)
#13 mitochondrial genes - in most cells 11/12 i.e. ~90% are detected

Answer:

#library size
plot1 <- colData(sce) %>%
    as_tibble() %>% 
    ggplot() +
    geom_violin(aes(x = Sample, y = sum)) +
    labs(x = "Total UMI", y = "Value")
#genes detected
plot2 <- colData(sce) %>%
    as_tibble() %>% 
    ggplot() +
    geom_violin(aes(x = Sample, y = detected)) +
    labs(x = "Genes detected", y = "Value")
#mitochondrial fraction
plot3 <- colData(sce) %>%
    as_tibble() %>% 
    ggplot() +
    geom_violin(aes(x = Sample, y = subsets_MT_percent)) +
    labs(x = "Percentage mitochondrial", y = "Value") + 
  geom_jitter(aes(x = Sample, y = subsets_MT_percent), width = 0.5) + #do this properly with beeswarm 
  ylim(0,20)
cowplot::plot_grid(plot1, plot2, plot3, nrow = 1)
sce <- sce[, sce$sum > 4500 & sce$subsets_MT_percent < 15 & sce$detected > 1500]
sce <- scuttle::addPerFeatureQC(sce)
rowData(sce)
## ggplot2
plot1 <- rowData(sce) %>%
    as_tibble() %>% 
    ggplot() +
    geom_violin(aes(x = "Sample", y = mean)) +
    labs(x = "mean expression", y = "Value")

plot2 <- rowData(sce) %>%
    as_tibble() %>% 
    ggplot() +
    geom_violin(aes(x = "Sample", y = detected)) +
    labs(x = "percentege of cells with non-zero count", y = "Value")
plot3 <- rowData(sce) %>%
  as_tibble() %>% 
  ggplot() + 
  geom_point(aes(y = log(mean), x = detected)) +
  labs(y = "mean expression", x = "percentage of cells with non-zero counts")

cowplot::plot_grid(plot1, plot2,plot3, nrow = 1)

Exercise step 3. Normalisation

Note: use scuttle::logNormCounts() to compute log-normalised counts. What is the return value? Where can you find the normalised counts?

library(scuttle)
sce <- scuttle::logNormCounts(sce) #leave transform as default 
assayNames(sce)
assay(sce, "logcounts")[1:5,100:110]

Answer:

Note: how can you tell whether the normalisation was effective? Compare with https://osca.bioconductor.org/feature-selection.html#quantifying-per-gene-variation

library(DelayedMatrixStats)
#
x <- DelayedArray(assay(sce, "counts"))
plot_data <- tibble(
    mean = DelayedMatrixStats::rowMeans2(x),
    variance = DelayedMatrixStats::rowVars(x)
)
plot_counts <- ggplot(plot_data, aes(x = mean, y = variance)) +
    geom_point()
#
x <- DelayedArray(assay(sce, "logcounts"))
plot_data <- tibble(
    mean = DelayedMatrixStats::rowMeans2(x),
    variance = DelayedMatrixStats::rowVars(x)
)
plot_logcounts <- ggplot(plot_data, aes(x = mean, y = variance)) +
    geom_point()
cowplot::plot_grid(plot_counts, plot_logcounts, nrow = 1)

Answer:

Answer:

Exercise

Feature selection

Select features for downstream analyses, e.g. highly variable genes; use scran.

library(scran)
dec <- scran::modelGeneVar(sce) #assay.type = "logcounts" is default for S4 objects
dec

Answer:

How do you interpret those different values?

ggplot(as_tibble(dec)) +
    geom_point(aes(mean, total), color = "black") + #total variance 
    geom_point(aes(mean, bio), color = "blue") + # (calculated) biological variance (essentially black - red)
    geom_point(aes(mean, tech), color = "red") #technical variance 

Answer:

What is the output? How many genes do you identify? Where are those genes located in the mean vs. (biological) variance plot? What happens to this plot if you set more stringent thresholds to define highly variable genes?

hvg <- scran::getTopHVGs(dec,
                         prop = 0.1) #top 10% variable genes 
length(hvg) #1344 genes 
## ggplot2
#option1
dec_variable <- dec[match(hvg,rownames(dec)),] 
dec_variable$hvg <- "variable"
dec_invariable <- dec[-match(hvg,rownames(dec)),] 
dec_invariable$hvg <- "invariable"
dec1 <- rbind(dec_variable,dec_invariable)

#option2
dec$hvg <- row.names(dec) %in% hvg
head(dec)

#plot
ggplot(as.tibble(dec1), aes(mean, bio, color = hvg)) + geom_point()

Answer:

Exercise

Dimensionality reduction

Note: only give the set of highly variable genes to the scater::runPCA() function, to save time, memory, and to focus on biologically informative genes in the data set.

set.seed(1234)
sce <- scater::runPCA(sce,
                      subset_row = hvg)
percent.var <- attr(reducedDim(sce), "percentVar")
library(PCAtools)
chosen.elbow <- findElbowPoint(percent.var) #PCAtools not available
plot(percent.var)
sce <- scater::runUMAP(sce, 
                       ncomponents = 2, #default 
                       dimred = "PCA", 
                       n_dimred = 1:20) #number of principal components to use 
sce <- scater::runTSNE(sce)
sce_umap <- reducedDim(sce,"UMAP") %>% as.data.frame() %>%
  mutate(Barcode = sce@colData@listData$Barcode,
         subsets_MT_percent = sce@colData@listData$subsets_MT_percent,
         sum = sce@colData@listData$sum,
         detected = sce@colData@listData$detected)

sce_tsne <- reducedDim(sce,"TSNE") %>% as.data.frame() %>%
  mutate(Barcode = sce@colData@listData$Barcode,
         subsets_MT_percent = sce@colData@listData$subsets_MT_percent,
         sum = sce@colData@listData$sum,
         detected = sce@colData@listData$detected)

p1 <- ggplot(sce_umap, aes(x = V1, y = V2), fill = subsets_MT_percent) + 
  geom_point() + 
  scale_fill_continuous(type = "gradient") + 
  labs(x = "UMAP1", y = "UMAP2")

p2 <- ggplot(sce_tsne, aes(x = V1, y = V2), fill = subsets_MT_percent) + 
  geom_point() + 
  scale_fill_continuous(type = "gradient") + 
  labs(x = "TSNE1", y = "TSNE2")

cowplot::plot_grid(p1,p2, ncol = 2)

Bonus point

sce_denoise <- scran::denoisePCA(   )

Answer:

sce_denoise <- scater::runUMAP(   )
sce_denoise_umap <- 

plot_grid(
    sce_umap + theme(legend.position = "bottom"),
    sce_denoise_umap + theme(legend.position = "bottom"),
    nrow = 1)

Exercise

Clustering

Cluster cells using scran.

output <- scran::getClusteredPCs(reducedDim(sce, "PCA"))
metadata(output)$chosen #21
g <- scran::buildSNNGraph(sce,
                          #use.dimred = 'PCA') #use.dimred is the alternative to d
                          d = 21) #from function above
colData(sce)[["cluster_louvain"]] <- factor(igraph::cluster_louvain(g)$membership)  

Note: Dimensionality reduction and clustering are two separate methods both based on the PCA coordinates. They may not always agree with each other, often helping to diagnose over- or under-clustering, as well as parameterisation of dimensionality reduction methods.

gg_snn_21<- reducedDim(x = sce, type = "UMAP") %>%
    as.data.frame() %>%
    as_tibble() %>%
    bind_cols(colData(sce) %>% as_tibble()) %>%
    sample_frac() %>%
    ggplot() +
    geom_point(aes(V1, V2, color=cluster_louvain)) +
    cowplot::theme_cowplot()
cowplot::plot_grid(gg_snn,gg_snn_21, ncol = 2)

Bonus point

snn_plots <- list()
for (d in c(5, 10, 13, 15)) {
    g <- scran::buildSNNGraph(t(reducedDim(sce, "PCA")), d = d)
    colData(sce)[[sprintf("snn_d", d)]] <- factor(igraph::cluster_louvain(g)$membership)
    gg_d <- reducedDim(x = sce, type = "UMAP") %>%
        as.data.frame() %>%
        as_tibble() %>%
        bind_cols(colData(sce) %>% as_tibble()) %>%
        sample_frac() %>%
        ggplot() +
        geom_point(aes(V1, V2, color=snn_d)) +
        labs(title = d) +
        cowplot::theme_cowplot()
    snn_plots[[as.character(d)]] <- gg_d
}
cowplot::plot_grid(plotlist = snn_plots, ncol = 2)
sce$quickCluster <- scran::quickCluster(   )

gg_cluster <- reducedDim(x = sce, type = "UMAP") %>%
    as.data.frame() %>%
    as_tibble() %>%
    bind_cols(colData(sce) %>% as_tibble()) %>%
    sample_frac() %>%
    ggplot() +
    geom_point(aes(V1, V2, color=quickCluster)) +
    cowplot::theme_cowplot()
gg_cluster

Exercise

Cluster markers

markers <- scran::findMarkers(sce,
                              groups = sce$snn_d, 
                              test.type = "t") #default = t test (how valid assumption of normality), other options wilcox (non-parametric - less genes but more confidence), binom - data so sparse in scRNAseq that rank more appropriate maybe
#wilcoxon test does not give fold changes - but these are unreliable anyways, as 

#simple list: more compact than list but same behaviour
markers[[1]]
marker_id <- "ENSG00000090382" 
marker_name <- rowData(sce)[marker_id, "Symbol"]

colData(sce) %>%
    as_tibble() %>%
    mutate(marker = assay(sce, "logcounts")[marker_id, ]) %>% #assay(sce,"logcounts") retrieves gene expression data
    ggplot(aes(cluster_louvain, marker)) +
    geom_violin(aes(fill = cluster_louvain)) +
    geom_point() +
    labs(title = marker_name, subtitle = marker_id) +
    scale_color_viridis_c()
scater::plotReducedDim(sce, dimred = "UMAP", colour_by = marker_id) + labs(title = marker_name)

reducedDim(x = sce, type = "UMAP") %>%
    as.data.frame() %>%
    as_tibble() %>%
    mutate(marker = assay(sce, "logcounts")[marker_id, ]) %>% #add gene expression from assay
    slice_sample(prop = 1) %>% 
    ggplot() +
    geom_point(aes(V1, V2, color=marker), size = 0.1) +
    scale_color_viridis_c() +
    labs(title = marker_name, subtitle = marker_id) +
    cowplot::theme_cowplot()

Exercise

Interactive visualisation

library(iSEE)
app <- iSEE(sce)
if (interactive()) {
  shiny::runApp(app)
}

Bonus point

initial_panel_list <- list(
  ReducedDimensionPlot(PanelWidth=4L),
  RowDataTable(PanelWidth=8L)
)
app <- iSEE::iSEE(sce, initial = initial_panel_list)
if (interactive()) {
  shiny::runApp(app)
}