satijalab / seurat

R toolkit for single cell genomics
http://www.satijalab.org/seurat
Other
2.26k stars 906 forks source link

Extract expression matrix for specific cluster and group in integrated analysis #2825

Closed anotheralexwest closed 4 years ago

anotheralexwest commented 4 years ago

Hi, I have eight samples (AW1 to AW8), these represent four experimental groups, two biological replicates in each group (T1 to T4; T1=AW1+AW2, T2=AW3+AW4, T3=AW5+AW6, T4=AW7+AW8). I have run an integrated analysis on all the samples and want to compare gene expression between the clusters. I plan to use MAST, which needs raw counts - so I want to export raw count data from group within each cluster.

I have tried using this command, which I expected to generate a matrix of raw RNA expression data for the T1 sample point at cluster 5:

raw.data <- as.matrix(
  GetIntegrationData(
    Timepoints.combined, 
    integration.name="T1", 
    slot = "counts")[, WhichCells(Timepoints.combined, ident = "5")]
)

but get the response:

Error in GetIntegrationData(Timepoints.combined, integration.name = "T1",  : 
  Requested integration key does not exist

Any assistance with this, or another way to export raw data from integrated datasets would be greatly appreciated. Cheers, Alex

I post the rest of my script below:

library(plyr)
library(dplyr)
library(Seurat)
library(cowplot)

# Load the datasets, must have different project names to be able to separate them later on
AW1.data <- Read10X(data.dir = "C:\\snRNAseq\\Single sample analysis\\AW1")
AW1 <- CreateSeuratObject(counts = AW1.data, project = "T1",min.cells = 3, min.features = 200)
AW2.data <- Read10X(data.dir = "C:\\snRNAseq\\Single sample analysis\\AW2")
AW2 <- CreateSeuratObject(counts = AW2.data, project = "T1",min.cells = 3, min.features = 200)
AW3.data <- Read10X(data.dir = "C:\\snRNAseq\\Single sample analysis\\AW3")
AW3 <- CreateSeuratObject(counts = AW3.data, project = "T2",min.cells = 3, min.features = 200)
AW4.data <- Read10X(data.dir = "C:\\snRNAseq\\Single sample analysis\\AW4")
AW4 <- CreateSeuratObject(counts = AW4.data, project = "T2",min.cells = 3, min.features = 200)
AW5.data <- Read10X(data.dir = "C:\\snRNAseq\\Single sample analysis\\AW5")
AW5 <- CreateSeuratObject(counts = AW5.data, project = "T3",min.cells = 3, min.features = 200)
AW6.data <- Read10X(data.dir = "C:\\snRNAseq\\Single sample analysis\\AW6")
AW6 <- CreateSeuratObject(counts = AW6.data, project = "T3",min.cells = 3, min.features = 200)
AW7.data <- Read10X(data.dir = "C:\\snRNAseq\\Single sample analysis\\AW7")
AW7 <- CreateSeuratObject(counts = AW7.data, project = "T4",min.cells = 3, min.features = 200)
AW8.data <- Read10X(data.dir = "C:\\snRNAseq\\Single sample analysis\\AW8")
AW8 <- CreateSeuratObject(counts = AW8.data, project = "T4",min.cells = 3, min.features = 200)

AW1 <- subset(AW1, subset = nFeature_RNA > 200 & nFeature_RNA < 2500)
AW2 <- subset(AW2, subset = nFeature_RNA > 200 & nFeature_RNA < 2500)
AW3 <- subset(AW3, subset = nFeature_RNA > 200 & nFeature_RNA < 2500)
AW4 <- subset(AW4, subset = nFeature_RNA > 200 & nFeature_RNA < 2500)
AW5 <- subset(AW5, subset = nFeature_RNA > 200 & nFeature_RNA < 2500)
AW6 <- subset(AW6, subset = nFeature_RNA > 200 & nFeature_RNA < 2500)
AW7 <- subset(AW7, subset = nFeature_RNA > 200 & nFeature_RNA < 2500)
AW8 <- subset(AW8, subset = nFeature_RNA > 200 & nFeature_RNA < 2500)

#Merge Datasets
# # to merge more than two objects, pass one to x and a list of objects to y
# merge(x = pbmc_small, y = c(pbmc_small, pbmc_small))
T1 <- merge(AW1, y = AW2, add.cell.ids = c("AW1","AW2"), project = "Gill")
T2 <- merge(AW3, y = AW4, add.cell.ids = c("AW3","AW4"), project = "Gill")
T3 <- merge(AW5, y = AW6, add.cell.ids = c("AW5","AW6"), project = "Gill")
T4 <- merge(AW7, y = AW8, add.cell.ids = c("AW7","AW8"), project = "Gill")

Combined <- merge(T1, y = c(T2,T3,T4), add.cell.ids = c("T1","T2","T3","T4"), project = "All")
Combined
# An object of class Seurat 
# 34514 features across 18844 samples within 1 assay 
# Active assay: RNA (34514 features)

#Check tops and bottom, also IDs
head(colnames(Combined))
tail(colnames(Combined))
table(Combined$orig.ident)
# T1   T2   T3   T4 
# 4539 4876 4873 4556 

#Split up object 
Timepoints <- SplitObject(Combined, split.by="orig.ident")

#normalise and ID variable features
Timepoints.list <- lapply(X = Timepoints, FUN = function(x) {
  x <- NormalizeData(x,normalization.method = "LogNormalize", scale.factor = 10000)
  x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 2000)
})

#ID anchors between datasets which takes a list of Seurat objects as input, and use 
#these anchors to integrate the two datasets together with IntegrateData.
#dims defines the number of dimensions used from the CCA (canonical correlation analysis)
#see more in stuwart and butler paper
Gill.anchors <- FindIntegrationAnchors(object.list = Timepoints.list, dims = 1:20)
#might need to increase memory limit in r using:
# memory.size()
# memory.limit(size=NA)
memory.limit(size=20000)
Timepoints.combined <- IntegrateData(anchorset = Gill.anchors, dims = 1:20)
DefaultAssay(Timepoints.combined) <- "integrated"

Timepoints.combined <- ScaleData(Timepoints.combined, verbose = FALSE)
Timepoints.combined <- RunPCA(Timepoints.combined, npcs = 30, verbose = FALSE)
# UMAP and Clustering
Timepoints.combined <- RunUMAP(Timepoints.combined, reduction = "pca", dims = 1:20)
Timepoints.combined <- FindNeighbors(Timepoints.combined, reduction = "pca", dims = 1:20)
Timepoints.combined <- FindClusters(Timepoints.combined, resolution = 0.5)

#Visualise
p1 <- DimPlot(Timepoints.combined, reduction = "umap", group.by = "orig.ident")
p2 <- DimPlot(Timepoints.combined, reduction = "umap", label = TRUE)
plot_grid(p1, p2)

#side by side
DimPlot(Timepoints.combined, reduction = "umap", split.by = "orig.ident", ncol=2, nrow=2, label=T)

#number of cells per cluster
# Get number of cells per cluster and per sample of origin
table(Timepoints.combined@active.ident, Timepoints.combined@meta.data$orig.ident)
# T1  T2  T3  T4
# 0  366 979 792 730
# 1  643 618 685 470
# 2  282 276 468 474
# 3  290 252 451 474
# 4  351 228 354 268
# 5  573 361 120 117
# 6  126 386 241 331
# 7  316 258 235 188
# 8  123 370 236 255
# 9   96 136 287 293
# 10 107 103 186 197
# 11 197  86 160 127
# 12 253 131  65  21
# 13 197 136  75  57
# 14  37  75 142 163
# 15 117 126  77  83
# 16 185 115  54  46
# 17  97 129  62  86
# 18  90  61  85 112
# 19  93  50  98  64

#Make matrix of raw values for each cluster (for MAST)
raw.data <- as.matrix(GetIntegrationData(Timepoints.combined, integration.name="T1", slot = "counts")[, WhichCells(Timepoints.combined, ident = "0")])
andrewwbutler commented 4 years ago

Hi Alex,

To get the matrix of integrated data, you can run the following command.

GetAssayData(Timepoints.combined[["integrated"]], slot = "data") 

This is not count data however. You can think of it as "corrected" normalized data. Another point though, is that we don't recommend running DE tests on the integrated data. See FAQ 4