title: "Template code for single-cell analysis using Bioconductor" author: "Kevin Rue-Albrecht" date: "05/10/2022" output: html_document

knitr::opts_chunk$set(echo = TRUE)

Exercise

Import scRNA-seq data and create a SingleCellExperiment object

Import the filtered matrix into R; use DropletUtils.

Note: use the samples= argument of the DropletUtils::read10xCounts() function to give a memorable name to each sample. Check the difference without using the samples argument.

library(DropletUtils)
setwd("/project/obds/hfuchs/4_scRNAseq/3_singlecell_bioconductor/")
sce <- DropletUtils::read10xCounts(samples = ".", sample.names = "pbmc5k")
#alternative
sce <- DropletUtils::read10xCounts(samples = c("pbmc5k" =                                        "/project/obds/shared/resources/4_r_single_cell/singlecell_bioconductor/filtered_feature_bc_matrix/"))

Print the object. What can you tell about its contents?

sce

What can you tell from the object metadata?

Note: slots of SummarizedExperiment objects are typically accessed using functions of the same name, e.g. metadata().

metadata(sce)
rowData(sce)
colData(sce) #verified that each barcode has been assigned to sample

Exercise

Quality control

Compute and visualise quality control metrics (library size, genes detected, mitochondrial fraction); use scuttle and/or scater.
- Identify mitochondrial genes and pass those to the subsets argument of the scuttle::addPerCellQC() function.
- What is the return value? Where are the quality metrics stored? What is the difference with scuttle::perCellQCMetrics()?

is.mito <- grep("^MT-", rowData(sce)$Symbol,
                value = FALSE) # subset looks for ensmbl rather than gene name, so index more reliable (but useful for checking)

library(scuttle)
sce <- scuttle::addPerCellQC(sce, 
                             subsets = list(MT = is.mito))
rowData(sce)
colData(sce) #this is were MT metrics are specified (+ some overall QC metrics)
#13 mitochondrial genes - in most cells 11/12 i.e. ~90% are detected

Visualise library size, genes detected and mitochondrial fraction as three violin plots; use ggplot2.

#library size
plot1 <- colData(sce) %>%
    as_tibble() %>% 
    ggplot() +
    geom_violin(aes(x = Sample, y = sum)) +
    labs(x = "Total UMI", y = "Value")
#genes detected
plot2 <- colData(sce) %>%
    as_tibble() %>% 
    ggplot() +
    geom_violin(aes(x = Sample, y = detected)) +
    labs(x = "Genes detected", y = "Value")
#mitochondrial fraction
plot3 <- colData(sce) %>%
    as_tibble() %>% 
    ggplot() +
    geom_violin(aes(x = Sample, y = subsets_MT_percent)) +
    labs(x = "Percentage mitochondrial", y = "Value") + 
  geom_jitter(aes(x = Sample, y = subsets_MT_percent), width = 0.5) + #do this properly with beeswarm 
  ylim(0,20)
cowplot::plot_grid(plot1, plot2, plot3, nrow = 1)

Filter cells, keeping those with more than 4,500 UMI, less than 15% mitochondrial UMI, and more than 1,500 genes detected. -> decide this based off violin plot

sce <- sce[, sce$sum > 4500 & sce$subsets_MT_percent < 15 & sce$detected > 1500]

Similarly, use scuttle::perFeatureQCMetrics() or scuttle::addPerFeatureQC() to compute per-feature quality metrics, and visualise those metrics.

sce <- scuttle::addPerFeatureQC(sce)
rowData(sce)

## ggplot2
plot1 <- rowData(sce) %>%
    as_tibble() %>% 
    ggplot() +
    geom_violin(aes(x = "Sample", y = mean)) +
    labs(x = "mean expression", y = "Value")

plot2 <- rowData(sce) %>%
    as_tibble() %>% 
    ggplot() +
    geom_violin(aes(x = "Sample", y = detected)) +
    labs(x = "percentege of cells with non-zero count", y = "Value")
plot3 <- rowData(sce) %>%
  as_tibble() %>% 
  ggplot() + 
  geom_point(aes(y = log(mean), x = detected)) +
  labs(y = "mean expression", x = "percentage of cells with non-zero counts")

cowplot::plot_grid(plot1, plot2,plot3, nrow = 1)

Exercise step 3. Normalisation

Convert the counts into normalized expression values to eliminate cell-specific biases (e.g., in capture efficiency); use scuttle and/or scran. Display the names of the assays available after that step.

Note: use scuttle::logNormCounts() to compute log-normalised counts. What is the return value? Where can you find the normalised counts?

library(scuttle)
sce <- scuttle::logNormCounts(sce) #leave transform as default 
assayNames(sce)
assay(sce, "logcounts")[1:5,100:110]

Plot the variance against the mean of each gene.

Note: how can you tell whether the normalisation was effective? Compare with https://osca.bioconductor.org/feature-selection.html#quantifying-per-gene-variation

library(DelayedMatrixStats)
#
x <- DelayedArray(assay(sce, "counts"))
plot_data <- tibble(
    mean = DelayedMatrixStats::rowMeans2(x),
    variance = DelayedMatrixStats::rowVars(x)
)
plot_counts <- ggplot(plot_data, aes(x = mean, y = variance)) +
    geom_point()
#
x <- DelayedArray(assay(sce, "logcounts"))
plot_data <- tibble(
    mean = DelayedMatrixStats::rowMeans2(x),
    variance = DelayedMatrixStats::rowVars(x)
)
plot_logcounts <- ggplot(plot_data, aes(x = mean, y = variance)) +
    geom_point()
cowplot::plot_grid(plot_counts, plot_logcounts, nrow = 1)

When would you rather use scuttle::computePooledFactors instead?

Exercise

Feature selection

Select features for downstream analyses, e.g. highly variable genes; use scran.

Use scran::modelGeneVar() to model the variance of the log-expression profiles for each gene. What is the output?

library(scran)
dec <- scran::modelGeneVar(sce) #assay.type = "logcounts" is default for S4 objects
dec

Visualise the relation between the mean expression of each gene and the total / biological / technical variance of each gene.

How do you interpret those different values?

ggplot(as_tibble(dec)) +
    geom_point(aes(mean, total), color = "black") + #total variance 
    geom_point(aes(mean, bio), color = "blue") + # (calculated) biological variance (essentially black - red)
    geom_point(aes(mean, tech), color = "red") #technical variance

Use scran::getTopHVGs() to identify highly variable genes (e.g., top 10%).

What is the output? How many genes do you identify? Where are those genes located in the mean vs. (biological) variance plot? What happens to this plot if you set more stringent thresholds to define highly variable genes?

hvg <- scran::getTopHVGs(dec,
                         prop = 0.1) #top 10% variable genes 
length(hvg) #1344 genes

## ggplot2
#option1
dec_variable <- dec[match(hvg,rownames(dec)),] 
dec_variable$hvg <- "variable"
dec_invariable <- dec[-match(hvg,rownames(dec)),] 
dec_invariable$hvg <- "invariable"
dec1 <- rbind(dec_variable,dec_invariable)

#option2
dec$hvg <- row.names(dec) %in% hvg
head(dec)

#plot
ggplot(as.tibble(dec1), aes(mean, bio, color = hvg)) + geom_point()

Exercise

Dimensionality reduction

Apply PCA; use scater or BiocSingular. Set a seed to control reproducibility. List the names of dimensionality reduction results available.

Note: only give the set of highly variable genes to the scater::runPCA() function, to save time, memory, and to focus on biologically informative genes in the data set.

set.seed(1234)
sce <- scater::runPCA(sce,
                      subset_row = hvg)
percent.var <- attr(reducedDim(sce), "percentVar")
library(PCAtools)
chosen.elbow <- findElbowPoint(percent.var) #PCAtools not available
plot(percent.var)

Apply UMAP and t-SNE successively on the output of the PCA. List the names of dimensionality reduction results available each time.

sce <- scater::runUMAP(sce, 
                       ncomponents = 2, #default 
                       dimred = "PCA", 
                       n_dimred = 1:20) #number of principal components to use

sce <- scater::runTSNE(sce)

Visualise the scatterplot of cells produced by each of those dimensionality reduction methods. Considering coloring points with quality control metrics.

sce_umap <- reducedDim(sce,"UMAP") %>% as.data.frame() %>%
  mutate(Barcode = sce@colData@listData$Barcode,
         subsets_MT_percent = sce@colData@listData$subsets_MT_percent,
         sum = sce@colData@listData$sum,
         detected = sce@colData@listData$detected)

sce_tsne <- reducedDim(sce,"TSNE") %>% as.data.frame() %>%
  mutate(Barcode = sce@colData@listData$Barcode,
         subsets_MT_percent = sce@colData@listData$subsets_MT_percent,
         sum = sce@colData@listData$sum,
         detected = sce@colData@listData$detected)

p1 <- ggplot(sce_umap, aes(x = V1, y = V2), fill = subsets_MT_percent) + 
  geom_point() + 
  scale_fill_continuous(type = "gradient") + 
  labs(x = "UMAP1", y = "UMAP2")

p2 <- ggplot(sce_tsne, aes(x = V1, y = V2), fill = subsets_MT_percent) + 
  geom_point() + 
  scale_fill_continuous(type = "gradient") + 
  labs(x = "TSNE1", y = "TSNE2")

cowplot::plot_grid(p1,p2, ncol = 2)

Bonus point

Use scran::denoisePCA() to remove principal components that correspond to technical noise, and compare downstream t-SNE or UMAP with those obtained before de-noising. Name the output sce_denoise. How many components remain after denoising? Visualise a UMAP of the denoised PCA and compare.

sce_denoise <- scran::denoisePCA(   )

sce_denoise <- scater::runUMAP(   )

sce_denoise_umap <- 

plot_grid(
    sce_umap + theme(legend.position = "bottom"),
    sce_denoise_umap + theme(legend.position = "bottom"),
    nrow = 1)

Exercise

Clustering

Cluster cells using scran.

Start with scran::getClusteredPCs() to cluster cells after using varying number of PCs, and pick the number of PCs using a heuristic based on the number of clusters.

output <- scran::getClusteredPCs(reducedDim(sce, "PCA"))
metadata(output)$chosen #21

Use scran::buildSNNGraph() and igraph::cluster_louvain() with that "ideal" number of PCs. Assign the cluster label to a cell metadata column named "label".

g <- scran::buildSNNGraph(sce,
                          #use.dimred = 'PCA') #use.dimred is the alternative to d
                          d = 21) #from function above
colData(sce)[["cluster_louvain"]] <- factor(igraph::cluster_louvain(g)$membership)

Visualise the assigned cluster on your preferred dimensionality reduction layout.

Note: Dimensionality reduction and clustering are two separate methods both based on the PCA coordinates. They may not always agree with each other, often helping to diagnose over- or under-clustering, as well as parameterisation of dimensionality reduction methods.

gg_snn_21<- reducedDim(x = sce, type = "UMAP") %>%
    as.data.frame() %>%
    as_tibble() %>%
    bind_cols(colData(sce) %>% as_tibble()) %>%
    sample_frac() %>%
    ggplot() +
    geom_point(aes(V1, V2, color=cluster_louvain)) +
    cowplot::theme_cowplot()
cowplot::plot_grid(gg_snn,gg_snn_21, ncol = 2)

Bonus point

Test different numbers of principal components and compare results.

snn_plots <- list()
for (d in c(5, 10, 13, 15)) {
    g <- scran::buildSNNGraph(t(reducedDim(sce, "PCA")), d = d)
    colData(sce)[[sprintf("snn_d", d)]] <- factor(igraph::cluster_louvain(g)$membership)
    gg_d <- reducedDim(x = sce, type = "UMAP") %>%
        as.data.frame() %>%
        as_tibble() %>%
        bind_cols(colData(sce) %>% as_tibble()) %>%
        sample_frac() %>%
        ggplot() +
        geom_point(aes(V1, V2, color=snn_d)) +
        labs(title = d) +
        cowplot::theme_cowplot()
    snn_plots[[as.character(d)]] <- gg_d
}
cowplot::plot_grid(plotlist = snn_plots, ncol = 2)

Try scran::quickCluster(); identify key parameters and compare results.

sce$quickCluster <- scran::quickCluster(   )

gg_cluster <- reducedDim(x = sce, type = "UMAP") %>%
    as.data.frame() %>%
    as_tibble() %>%
    bind_cols(colData(sce) %>% as_tibble()) %>%
    sample_frac() %>%
    ggplot() +
    geom_point(aes(V1, V2, color=quickCluster)) +
    cowplot::theme_cowplot()
gg_cluster

Exercise

Cluster markers

Use scran::findMarkers() to identify markers for each cluster. Display the metadata of markers for the first cluster.

markers <- scran::findMarkers(sce,
                              groups = sce$snn_d, 
                              test.type = "t") #default = t test (how valid assumption of normality), other options wilcox (non-parametric - less genes but more confidence), binom - data so sparse in scRNAseq that rank more appropriate maybe
#wilcoxon test does not give fold changes - but these are unreliable anyways, as 

#simple list: more compact than list but same behaviour
markers[[1]]

Visualise the expression of selected markers:
- As a dot plot, optionally with a violin layer.

marker_id <- "ENSG00000090382" 
marker_name <- rowData(sce)[marker_id, "Symbol"]

colData(sce) %>%
    as_tibble() %>%
    mutate(marker = assay(sce, "logcounts")[marker_id, ]) %>% #assay(sce,"logcounts") retrieves gene expression data
    ggplot(aes(cluster_louvain, marker)) +
    geom_violin(aes(fill = cluster_louvain)) +
    geom_point() +
    labs(title = marker_name, subtitle = marker_id) +
    scale_color_viridis_c()

On a dimensionality reduction layout. Compare with the cluster labels.

scater::plotReducedDim(sce, dimred = "UMAP", colour_by = marker_id) + labs(title = marker_name)

reducedDim(x = sce, type = "UMAP") %>%
    as.data.frame() %>%
    as_tibble() %>%
    mutate(marker = assay(sce, "logcounts")[marker_id, ]) %>% #add gene expression from assay
    slice_sample(prop = 1) %>% 
    ggplot() +
    geom_point(aes(V1, V2, color=marker), size = 0.1) +
    scale_color_viridis_c() +
    labs(title = marker_name, subtitle = marker_id) +
    cowplot::theme_cowplot()

Exercise

Interactive visualisation

Use iSEE::iSEE() to launch an interactive web-application to visualise the contents of the SingleCellExperiment object.

library(iSEE)
app <- iSEE(sce)
if (interactive()) {
  shiny::runApp(app)
}

Bonus point

Preconfigure the application to start with a subset of panels, e.g.

initial_panel_list <- list(
  ReducedDimensionPlot(PanelWidth=4L),
  RowDataTable(PanelWidth=8L)
)
app <- iSEE::iSEE(sce, initial = initial_panel_list)
if (interactive()) {
  shiny::runApp(app)
}

hannahsfuchs / OBDS_Course

scRNAseq with Bioconductor #3

title: "Template code for single-cell analysis using Bioconductor" author: "Kevin Rue-Albrecht" date: "05/10/2022" output: html_document

Exercise

Import scRNA-seq data and create a SingleCellExperiment object

Exercise

Quality control

Exercise step 3. Normalisation

Exercise

Feature selection

Exercise

Dimensionality reduction

Bonus point

Exercise

Clustering

Bonus point

Exercise

Cluster markers

Exercise

Interactive visualisation

Bonus point