hannahsfuchs / OBDS_Course

0 stars 0 forks source link

scRNAseq with Seurat #1

Open hannahsfuchs opened 1 year ago

hannahsfuchs commented 1 year ago

title: "Example code for single-cell analysis with Seurat, day 1" author: "Kevin Rue-Albrecht" date: "05/10/2021" output: html_document

knitr::opts_chunk$set(echo = TRUE)
library(Seurat)
library(tidyverse)

Exercise

Import scRNA-seq data and create a Seurat object

read10x_data <- Read10X("/project/obds/shared/resources/4_r_single_cell/singlecell_seuratday1/filtered_feature_bc_matrix")
#change to match Ensembl ID by changing gene.column value, but may be more useful later when it's more clear what is interesting 

Answer:

class(read10x_data) 
read10x_data[1:5,1:16] 

Answer:dots are features of sparse matrix - othewise each zero = rowID, colID, content = three values

read10x_data@Dim
glimpse(read10x_data@Dimnames)

Answer: 33538 features, 5155 barcodes

seurat_object <- CreateSeuratObject(counts = read10x_data, 
                                    project = "pmbc5k", 
                                    min.cells = 3, #arbitrary - set based on analysis
                                    min.features = 200) #arbitrary - depends on analysis 
#project = one dataset, relevant when wanting to merge batches later 
seurat_object

Answer: 19037 features, 5100 barcodes

dim(read10x_data) - dim(seurat_object)

lost 55 barcodes and 14501 features

Exercise

Accessing the contents of a Seurat object

seurat_object@active.assay
DefaultAssay(seurat_object) #if there is a function, generally better to use function, relies on structure staying constant, so code might stop working if developers change it 
seurat_object@assays #only one 
Assays(seurat_object) #will return just the names, more user-friendly
seurat_object@assays$RNA@data[1:6,1:6] #see above
GetAssayData(seurat_object,assay = "RNA",slot = "data")[1:6,1:6]

Answer:

seurat_object[[]][1:6,]
head(seurat_object[[]])

Answer:orig.ident, nCount_RNA, nFeature_RNA

seurat_object[["nCount_RNA"]]
class(seurat_object[["nCount_RNA"]])

Answer: factor

head(seurat_object$nCount_RNA)
class(seurat_object$nCount_RNA)

Answer: numeric - named vector

FetchData(seurat_object, vars = c("LYZ","nCount_RNA"), slot = "data")[1:6,]
head(FetchData(seurat_object, vars = c("LYZ"), slot = "data")) #head > [] if only queryign one variable

#variables can come from different location within the R object to generate ggplot-friendly dataframe
#e.g. assays and metadata together here 

data.frame

Demo

Common operations on Seurat objects

WhichCells() returns the names of cells that match a logical expression.

WhichCells(seurat_object, expression = LYZ > 500)

VariableFeatures() returns the names of variable features (for a given assay, if computed).

VariableFeatures(seurat_object)

subset() returns a new Seurat object restricted to certain features and cells.

subset(
    x = seurat_object,
    cells = WhichCells(seurat_object, expression = LYZ > 500),
    features = VariableFeatures(object = seurat_object)
)

Exercise

Quality control and visualisation

VlnPlot(seurat_object, features = c("nCount_RNA","nFeature_RNA"))
seurat_object[["percent.mt"]] <- PercentageFeatureSet(seurat_object,pattern = "^MT-")
VlnPlot(seurat_object, features = c("nCount_RNA", "nFeature_RNA", "percent.mt"))
FeatureScatter(seurat_object, "percent.mt","nFeature_RNA")
seurat_after_qc <- subset(seurat_object, 
                          subset = nCount_RNA > 4500 & percent.mt < 15 & nFeature_RNA > 1500)
seurat_after_qc
dim(seurat_object) - dim(seurat_after_qc)

Answer: 896 cells removed

Exercise

Normalisation

seurat_after_qc <- NormalizeData(seurat_after_qc, normalization.method = "LogNormalize")
#default scale.factor is 10,000 (i.e. count per 10,000) - this is an arbitrary number
#if data is good quality, any normalisation approach should give you pretty much the same reusults
#might lose the odd differentially expressed gene, or a bit of precision, but 99% should be the same

Bonus

GetAssayData(seurat_object, slot = "counts") 
GetAssayData(seurat_after_qc, slot = "counts")

ggplot_lyz_raw <- ggplot(FetchData(seurat_object, vars = "LYZ", slot = "data"),
                         aes(x = LYZ)) +
    geom_histogram() +
    coord_cartesian(ylim = c(0, 500)) +
    cowplot::theme_cowplot()
ggplot_lyz_normalised <- ggplot(FetchData(seurat_after_qc, vars = "LYZ", slot = "data"),
                         aes(x = LYZ)) +
    geom_histogram() +
    coord_cartesian(ylim = c(0, 500)) +
    cowplot::theme_cowplot()
cowplot::plot_grid(ggplot_lyz_raw, ggplot_lyz_normalised, ncol = 1)

Exercise

Variable features and scaling

seurat_after_qc <- FindVariableFeatures(seurat_after_qc,
                                        selection.method = "vst",
                                        nfeatures = 2000)

Answer:only 2000 genes taken forward in analysis compared to 4202

VariableFeatures(seurat_after_qc)[1:10]

Answer:

VariableFeaturePlot(seurat_after_qc)

Answer: define a sensible cut-off for number of variable features

seurat_after_qc <- ScaleData(seurat_after_qc,
                             vars.to.regress = c("nCount_RNA","percent.mt"))

Answer:

Exercise

Dimensionality reduction

seurat_after_qc <- RunPCA(seurat_after_qc, 
                          features = NULL, #default = run on variable fetaures 
                          reduction.name = "pca")

Answer:message output: top genes contributing to respective principal components

Reductions(seurat_after_qc)
PCAPlot(seurat_after_qc)

Bonus

# Use this code chunk to prepare a data.frame for ggplot2
pca_data <- FetchData(seurat_after_qc,
                      vars = c("PC_1","PC_2"))
head(pca_data)
ggplot(pca_data,aes(x = PC_1, y = PC_2)) +
    geom_point(size = 0.2) +
    cowplot::theme_cowplot()
ElbowPlot(seurat_after_qc, ndims = 50, reduction = "pca")

would use 20 principal components

seurat_after_qc <- RunUMAP(seurat_after_qc, 
                           dims = 1:20, #set if features NULL (could say e.g. features c(...) + PC20 )
                           n.components = 2)
DimPlot(seurat_after_qc, reduction = "umap")
UMAPPlot(seurat_after_qc)

Exercise

Clustering

seurat_after_qc <- FindNeighbors(seurat_after_qc)
#useful to start with default for k - but the larger your dataset, i.e. the more cells 

Answer:

The help page states that the function FindNeighbors() uses principal components 1 through 10, by default.

seurat_after_qc@graphs
res <- c(0.3,0.5,0.7,0.9)
seurat_after_qc <- FindClusters(seurat_after_qc, 
                                resolution  = res, 
                                algorithm = 1) #Community detection algorithm (default is Louvain)

#introducing resolutions as a vector stores them all as individual columns in metadata
library(cowplot)
cluster_resolution_plots <- lapply(
  res, 
  FUN = function(x) UMAPPlot(seurat_after_qc, group.by = paste0("RNA_snn_res.",x), label = TRUE))
plot_grid(
  cluster_resolution_plots[[1]],
  cluster_resolution_plots[[2]],
  cluster_resolution_plots[[3]],
  cluster_resolution_plots[[4]],
  ncol = 2
)

Exercise

Identify cluster markers

Idents(seurat_after_qc) <- "RNA_snn_res.0.5" #make sure to chose the right resolution
seurat_markers_all <- FindAllMarkers(
    seurat_after_qc,
    features = NULL, #default to use all genes
    logfc.threshold = 0.25,
    min.pct = 0.25)
class(seurat_markers_all)

Answer:data frame

head(seurat_markers_all)

Answer: cluster column gives cluster identity. pct1 - expression percentag ewithin cluster, pct.2 = expression outside of cluster

seurat_markers_all %>% group_by(cluster) %>% arrange(abs(avg_log2FC)) %>% top_n(3) 

top4_3 <- seurat_markers_all %>% filter(cluster == 3) %>% 
  filter(p_val_adj < 0.0001) %>% #could probably do this in FindAllMarkers()too
  arrange(desc(avg_log2FC)) %>% #can choose abs(avg_log2FC) too, for heatmap positive changes can be easier to interpret
  slice_head(n = 4) %>% #better than top_n() as that will sort on the fly by last variable in the table if not specified 
  select(gene) %>% unlist()
FeaturePlot(seurat_after_qc, 
            reduction = "umap",
            features = top4_3, 
            label = TRUE)
VlnPlot(seurat_after_qc, features = top4_3)

Answer:

markers_top10_clusters <- seurat_markers_all %>%
  filter(p_val_adj < 0.0001) %>% 
  group_by(cluster) %>%
  arrange(desc(avg_log2FC)) %>%
  slice_head(n = 10) 

DoHeatmap(seurat_after_qc,
          features = markers_top10_clusters$gene)