KrishnaswamyLab / PHATE

PHATE (Potential of Heat-diffusion for Affinity-based Transition Embedding) is a tool for visualizing high dimensional data.
http://phate.readthedocs.io
Other
472 stars 74 forks source link

Totally different result between phateR and RunPHATE #93

Open qiuhuiqq opened 4 years ago

qiuhuiqq commented 4 years ago

Hi, I recently tried to run the EB differentiation PHATE tutorial. The tutorial is run in python, but I am more used to R, so I tried to run using RunPHATE() function implemented in seurat ( https://github.com/scottgigante/seurat/tree/patch/add-PHATE-again ). But the resulting plot looks nothing like expected: 2.EB_differentiation_UMAP_tSNE_PHATE_compare.pdf

However, when I tried to run the same tutorial manually (not using seurat), the result is comparable to expected outcome: 3.EB_diff_manually_PHATE.pdf

I think this is caused by scaling in seurat, but I don't how to fix this issue. Any suggestions?

Here is my code using RunPHATE() in seurat (which did not produce correct result):

library(phateR)

library(reticulate)
use_condaenv(condaenv = "for_phate", conda = "/home/qiuhui/.conda/envs/py36/bin/conda",required = T)
reticulate::py_discover_config("phate")
reticulate::import("phate")

library("Seurat", lib.loc="~/R/test_packages_2/")
RunPHATE()
?RunPHATE

library(cowplot)

# perform cell filtering  based on quantile
input_and_filter_cells_by_lib_quantile <- function(file_path,project){
  T1 <- Read10X(file_path,unique.features = T)
  # seurat recommend filter genes while constructing seurat object
  T1.seurat <- CreateSeuratObject(counts = T1, project = project, min.cells = 10)
  ## filter top and bottom 20% of cells
  quantiles <- quantile(T1.seurat@meta.data$nCount_RNA,probs = c(0.2,0.8))
  T1.seurat <- subset(T1.seurat, subset = nCount_RNA > quantiles[1] & nCount_RNA < quantiles[2])
  return(T1.seurat)
}

T1.seurat <- input_and_filter_cells_by_lib_quantile("./EB_differentiation_data/T0_1A/", "T1")
T2.seurat <- input_and_filter_cells_by_lib_quantile("./EB_differentiation_data/T2_3B/", "T2")
T3.seurat <- input_and_filter_cells_by_lib_quantile("./EB_differentiation_data/T4_5C/", "T3")
T4.seurat <- input_and_filter_cells_by_lib_quantile("./EB_differentiation_data/T6_7D/", "T4")
T5.seurat <- input_and_filter_cells_by_lib_quantile("./EB_differentiation_data/T8_9E/", "T5")

all.seurat <- merge(T1.seurat,y = c(T2.seurat,T3.seurat,T4.seurat,T5.seurat),
                    add.cell.ids = c("T1","T2","T3","T4","T5"),project = "EB_diff")
table(all.seurat@meta.data$orig.ident)*100/ncol(all.seurat)

rm(T1.seurat,T2.seurat,T3.seurat,T4.seurat,T5.seurat)
#saveRDS(object = all.seurat, file = "EB_diff.all.seurat.rds")

## filter by MT genes
all.seurat[["percent.mt"]] <- PercentageFeatureSet(all.seurat, pattern = "^MT-")
summary(all.seurat[["percent.mt"]])   # 3rd quantile: 2.425
quantile(all.seurat@meta.data$percent.mt,probs = c(0.9))  # 90% quantile: 3.026
plot(density(all.seurat@meta.data$percent.mt))
VlnPlot(all.seurat,features = c("percent.mt"))

all.seurat <- subset(all.seurat, subset = percent.mt < 3)  # 16740 cells after filtering, 17162 genes

##### start seurat 3 analysis
all.seurat <- NormalizeData(all.seurat, verbose = FALSE)
all.seurat <- FindVariableFeatures(all.seurat, selection.method = "vst", nfeatures = 2000)

# Run the standard workflow for visualization and clustering
all.seurat <- ScaleData(all.seurat, verbose = FALSE)
all.seurat <- RunPCA(all.seurat, npcs = 30, verbose = FALSE)
# UMAP, tSNE and Clustering
all.seurat <- RunUMAP(all.seurat, reduction = "pca", dims = 1:20)
all.seurat <- RunTSNE(all.seurat, reduction = "pca", dims = 1:20)
all.seurat <- FindNeighbors(all.seurat, reduction = "pca", dims = 1:20)  # build SNN graph for cell clustering, not data integration
all.seurat <- FindClusters(all.seurat, resolution = 0.5)

# Visualization
plot_grid(DimPlot(all.seurat, reduction = "umap", label = TRUE),
          DimPlot(all.seurat, reduction = "umap", group.by = "orig.ident", cols = c("#9E0142","#F98E52","#FFFFBE","#86CFA5","#5E4FA2")),
          ncol = 2)
plot_grid(DimPlot(all.seurat, reduction = "tsne", label = TRUE),
          DimPlot(all.seurat, reduction = "tsne", group.by = "orig.ident", cols = c("#9E0142","#F98E52","#FFFFBE","#86CFA5","#5E4FA2")),
          ncol = 2)
# compare UMAP and tSNE
plot_grid(DimPlot(all.seurat, reduction = "umap", group.by = "orig.ident", cols = c("#9E0142","#F98E52","#FFFFBE","#86CFA5","#5E4FA2")),
          DimPlot(all.seurat, reduction = "tsne", group.by = "orig.ident", cols = c("#9E0142","#F98E52","#FFFFBE","#86CFA5","#5E4FA2")),
          ncol = 2)

################################# run PHATE ########################################

all.seurat <- RunPHATE(all.seurat, reduction = "pca", dims = 1:20,
                       knn = 4, decay = 15, t=12)
all.seurat <- RunPHATE(all.seurat)
all.seurat <- RunPHATE(all.seurat, knn = 4, decay = 15, t=12)
plot_grid(DimPlot(all.seurat, reduction = "umap",  group.by = "orig.ident", cols = c("#9E0142","#F98E52","#FFFFBE","#86CFA5","#5E4FA2")),
          DimPlot(all.seurat, reduction = "tsne",  group.by = "orig.ident", cols = c("#9E0142","#F98E52","#FFFFBE","#86CFA5","#5E4FA2")),
          DimPlot(all.seurat, reduction = "phate", group.by = "orig.ident", cols = c("#9E0142","#F98E52","#FFFFBE","#86CFA5","#5E4FA2")),
          ncol = 3)

## the result looks nothing like expected.................

And here is my code to run PHATE manually:

library(phateR)
library(Rmagic)

## phateR need the python package
library(reticulate)
use_condaenv(condaenv = "for_phate", conda = "/home/qiuhui/.conda/envs/py36/bin/conda", required = T)
reticulate::py_discover_config("phate")
reticulate::import("phate")
reticulate::import("magic")

library(ggplot2)
library(readr)
library(viridis)
library(cowplot)

##### still, need seurat to read 10X results ... #####
library("Seurat", lib.loc="~/R/test_packages_2/")

## input 10X results, then filter cells by library size quantile
input_and_filter_lib_quantile <- function(file_path){
  T1 <- Read10X(file_path,unique.features = T)
  lib.size <- colSums(T1)
  quantiles <- quantile(lib.size, probs = c(0.2,0.8))
  lib.size.filtered <- which( lib.size > quantiles[1] & lib.size < quantiles[2] )
  return(T1[,names(lib.size.filtered)])
}

T1 <- input_and_filter_lib_quantile("./EB_differentiation_data/T0_1A/")
T2 <- input_and_filter_lib_quantile("./EB_differentiation_data/T2_3B/")
T3 <- input_and_filter_lib_quantile("./EB_differentiation_data/T4_5C/")
T4 <- input_and_filter_lib_quantile("./EB_differentiation_data/T6_7D/")
T5 <- input_and_filter_lib_quantile("./EB_differentiation_data/T8_9E/")

all(rownames(T1) == rownames(T5))  ## all gene names are the same
sum(duplicated(rownames(T1)))      ## no duplicated gene name

## transpose, rows are cells !!!!
all.matrix <- t(cbind(T1,T2,T3,T4,T5))
dim(all.matrix)  # 18691 cells, 33694 genes

## filter dead cells (high MT gene expression)
mito.genes <- grep(pattern = "^MT", x = colnames(all.matrix), value = T)
cells.percent.mito <- rowSums(all.matrix[,mito.genes])/rowSums(all.matrix)
all.matrix <- all.matrix[which(cells.percent.mito < quantile(cells.percent.mito,0.9)),]
dim(all.matrix)  # 16821 cells, 33694 genes

cell_date <- c(rep("T1",ncol(T1)), rep("T2",ncol(T2)), rep("T3",ncol(T3)), rep("T4",ncol(T4)), rep("T5",ncol(T5)))
names(cell_date) <- c(colnames(T1), colnames(T2), colnames(T3), colnames(T4), colnames(T5))
cell_date <- cell_date[rownames(all.matrix)]

## filter genes
all.matrix <- all.matrix[,colSums( all.matrix > 0 ) >10]
dim(all.matrix)  # 16821 cells, 17409 genes

## sqrt normalize
all.data <- library.size.normalize(all.matrix)
all.data <- sqrt(all.data)

## run PHATE
all.PHATE <- phate(all.data)

ggplot(all.PHATE) + geom_point(aes(PHATE1, PHATE2, color=cell_date),size = 0.1) +
  scale_color_manual(values = c("#9E0142","#F98E52","#FFFFBE","#86CFA5","#5E4FA2"))

################# looks good !!!!!! #################

# rerun PHATE with new parameters
all.PHATE <- phate(all.data, knn=4, decay=15, t=12, init=all.PHATE)

ggplot(all.PHATE) + geom_point(aes(PHATE1, PHATE2, color=cell_date),size = 0.1) +
  scale_color_manual(values = c("#9E0142","#F98E52","#FFFFBE","#86CFA5","#5E4FA2")) + labs(color="diff_day")
scottgigante commented 4 years ago

Without having run this myself and looked in detail, the first thing that jumps out at me is that Seurat does not use the sqrt transformation. What do the results look like for the two different PHATE plots? If you apply a log transform in the second version in place of the sqrt transform, what does it look like then?

jsnagai commented 1 year ago

@qiuhuiqq Did you feed the PCA matrix to PHATE instead of the raw count matrix? When i did it in my end, i could retrieve pretty close results compared to RunPHATE.