sunduanchen / Scissor

Scissor package
GNU General Public License v3.0
191 stars 33 forks source link

Too many selected cells from Scissor #69

Open zxs132 opened 6 days ago

zxs132 commented 6 days ago

Hello, I am trying to replicate your simulation described in the manuscript.

I generated single cell data using Splatter with the following parameters: parameters <- setParams(parameters, update = list(nGenes = 5000, batchCells = 1000, group.prob = c(0.8, 0.1, 0.1), de.prob = c(0.1, 0.01, 0.01), seed = 1999))

This is identical to the description in the manuscript, and I assigned group 1 to shared phenotype cells, group 2 to phenotype 1, and group 3 to phenotype 2 and drew UMAP similar to Figure 1f.

However, when I was running the Scissor with the generated data, I had to make a binary variable indicating whether each cell is specifically assigned to one phenotype or shared by both phenotypes.

To do that, I generated a binary variable indicating group 1 cells (largest subpopulation) as 0 and group 2 and group 3 cells as 1 since group 2 and group 3 cells are unique to phenotype 1 and 2, respectively.

I could run Scissor afterwards, but when I drew UMAP and labeled the results, it said "Scissor identified 801 Scissor+ cells and 141 Scissor- cells." "The percentage of selected cell is: 94.200%" which is way more than I expected and what was shown in the Figure 1g.

May I know which part I should modify? I pasted the codes under here

Thank you.

library(splatter)

parameters <- newSplatParams() parameters

Set parameters as in the Scissor paper

parameters <- setParams(parameters, update = list(nGenes = 5000, batchCells = 1000, group.prob = c(0.8, 0.1, 0.1), de.prob = c(0.1, 0.01, 0.01), seed = 1999))

Data generation

set.seed(1999) sim <- splatSimulate(parameters, method = "groups")

Assign cells to phenotypes

Assume group 1 is common to both phenotypes, and groups 2 and 3 are unique

group_assignments <- sim$Group phenotype_1_cells <- group_assignments == "Group1" | group_assignments == "Group2" phenotype_2_cells <- group_assignments == "Group1" | group_assignments == "Group3" sim$Celltype <- factor(group_assignments, levels = c("Group1", "Group2", "Group3"), labels = c("Cell type 1", "Cell type 2", "Cell type 3")) sim$Phenotype <- factor(group_assignments, levels = c("Group1", "Group2", "Group3"), labels = c(paste0("Shared cells (", table(sim$Celltype)[[1]], ")"), paste0("Phenotype 1 cells (", table(sim$Celltype)[[2]], ")"), paste0("Phenotype 2 cells (", table(sim$Celltype)[[3]], ")"))) table(sim$Phenotype)

Subset the data for phenotype-specific cells

sim_data_1 <- sim@assays@data$counts[, phenotype_1_cells] sim_data_2 <- sim@assays@data$counts[, phenotype_2_cells]

Create bulk profiles by averaging expression over 50 samples per phenotype

sample_bulk <- function(data, n_samples = 50) { sampled_data <- replicate(n_samples, { sample_indices <- sample(1:ncol(data), ncol(data), replace = TRUE) rowMeans(data[, sample_indices]) }) return(sampled_data) }

Generating 50 bulk samples based on generated cells

bulk_1 <- sample_bulk(sim_data_1, n_samples = 50) bulk_2 <- sample_bulk(sim_data_2, n_samples = 50) bulk_sim <- as.matrix(cbind(bulk_1, bulk_2))

seurat_sim <- as.Seurat(sim, counts = "counts", data = "counts")

library(Scissor) library(Seurat) seurat_sim <- Seurat_preprocessing(sim@assays@data$counts, verbose = F) seurat_sim$Celltype <- sim$Celltype seurat_sim$Phenotype <- factor(sim$Phenotype, levels = c(paste0("Phenotype 1 cells (", table(sim$Celltype)[[2]], ")"), paste0("Phenotype 2 cells (", table(sim$Celltype)[[3]], ")"), paste0("Shared cells (", table(sim$Celltype)[[1]], ")"))) seurat_sim$Phenotype_binary <- as.numeric(ifelse(sim$Phenotype %in% c(paste0("Phenotype 1 cells (", table(sim$Celltype)[[2]], ")"), paste0("Phenotype 2 cells (", table(sim$Celltype)[[3]], ")")), 1, 0))

Modified Scissor slightly (Data accessing between @ and $)

Scissor_modified <- function(bulk_dataset, sc_dataset, phenotype, tag = NULL, alpha = NULL, cutoff = 0.2, family = c("gaussian", "binomial", "cox"), Save_file = "Scissor_inputs.RData", Load_file = NULL) { library(Seurat) library(Matrix) library(preprocessCore) if (is.null(Load_file)) { common <- intersect(rownames(bulk_dataset), rownames(sc_dataset)) if (length(common) == 0) { stop("There is no common genes between the given single-cell and bulk samples.") } if (class(sc_dataset) == "Seurat") { sc_exprs <- as.matrix(sc_dataset@assays$RNA$data) network <- as.matrix(sc_dataset@graphs$RNA_snn) } else { sc_exprs <- as.matrix(sc_dataset) Seurat_tmp <- CreateSeuratObject(sc_dataset) Seurat_tmp <- FindVariableFeatures(Seurat_tmp, selection.method = "vst", verbose = F) Seurat_tmp <- ScaleData(Seurat_tmp, verbose = F) Seurat_tmp <- RunPCA(Seurat_tmp, features = VariableFeatures(Seurat_tmp), verbose = F) Seurat_tmp <- FindNeighbors(Seurat_tmp, dims = 1:10, verbose = F) network <- as.matrix(Seurat_tmp@graphs$RNA_snn) } diag(network) <- 0 network[which(network != 0)] <- 1 dataset0 <- cbind(bulk_dataset[common, ], sc_exprs[common, ]) dataset1 <- normalize.quantiles(dataset0) rownames(dataset1) <- rownames(dataset0) colnames(dataset1) <- colnames(dataset0) Expression_bulk <- dataset1[, 1:ncol(bulk_dataset)] Expression_cell <- dataset1[, (ncol(bulk_dataset) + 1):ncol(dataset1)] X <- cor(Expression_bulk, Expression_cell) quality_check <- quantile(X) print("|**|") print("Performing quality-check for the correlations") print("The five-number summary of correlations:") print(quality_check) print("|**|") if (quality_check[3] < 0.01) { warning("The median correlation between the single-cell and bulk samples is relatively low.") } if (family == "binomial") { Y <- as.numeric(phenotype) z <- table(Y) if (length(z) != length(tag)) { stop("The length differs between tags and phenotypes. Please check Scissor inputs and selected regression type.") } else { print(sprintf("Current phenotype contains %d %s and %d %s samples.", z[1], tag[1], z[2], tag[2])) print("Perform logistic regression on the given phenotypes:") } } if (family == "gaussian") { Y <- as.numeric(phenotype) z <- table(Y) if (length(z) != length(tag)) { stop("The length differs between tags and phenotypes. Please check Scissor inputs and selected regression type.") } else { tmp <- paste(z, tag) print(paste0("Current phenotype contains ", paste(tmp[1:(length(z) - 1)], collapse = ", "), ", and ", tmp[length(z)], " samples.")) print("Perform linear regression on the given phenotypes:") } } if (family == "cox") { Y <- as.matrix(phenotype) if (ncol(Y) != 2) { stop("The size of survival data is wrong. Please check Scissor inputs and selected regression type.") } else { print("Perform cox regression on the given clinical outcomes:") } } save(X, Y, network, Expression_bulk, Expression_cell, file = Save_file) } else { load(Load_file) } if (is.null(alpha)) { alpha <- c(0.005, 0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9) } for (i in 1:length(alpha)) { set.seed(123) fit0 <- APML1(X, Y, family = family, penalty = "Net", alpha = alpha[i], Omega = network, nlambda = 100, nfolds = min(10, nrow(X))) fit1 <- APML1(X, Y, family = family, penalty = "Net", alpha = alpha[i], Omega = network, lambda = fit0$lambda.min) if (family == "binomial") { Coefs <- as.numeric(fit1$Beta[2:(ncol(X) + 1)]) } else { Coefs <- as.numeric(fit1$Beta) } Cell1 <- colnames(X)[which(Coefs > 0)] Cell2 <- colnames(X)[which(Coefs < 0)] percentage <- (length(Cell1) + length(Cell2))/ncol(X) print(sprintf("alpha = %s", alpha[i])) print(sprintf("Scissor identified %d Scissor+ cells and %d Scissor- cells.", length(Cell1), length(Cell2))) print(sprintf("The percentage of selected cell is: %s%%", formatC(percentage * 100, format = "f", digits = 3))) if (percentage < cutoff) { break } cat("\n") } print("|**|") return(list(para = list(alpha = alpha[i], lambda = fit0$lambda.min, family = family), Coefs = Coefs, Scissor_pos = Cell1, Scissor_neg = Cell2)) }

Visualizing generated data for simulation

library(ggplot2) DimPlot(seurat_sim, reduction = 'umap', label = FALSE, group.by = "Phenotype") + ggtitle("Simuation ground truth") + theme(legend.position = "right", legend.text = element_text(size = 10))

sim_tag <- c('No phenotype', 'Phenotype') start <- Sys.time() scissor_res_simul1 <- Scissor_modified(bulk_sim, seurat_sim, seurat_sim$Phenotype_binary, tag = sim_tag, alpha = 0.5, family = "binomial", Save_file = 'Scissor_sim_binary.RData', cutoff = 0.3) end <- Sys.time() (time_elapsed <- end - start)

Scissor_select <- rep(0, ncol(seurat_sim)) names(Scissor_select) <- colnames(seurat_sim) Scissor_select[Scissor_select == 0] <- "Background cells" Scissor_select[scissor_res_simul1$Scissor_pos] <- "Scissor + cells" Scissor_select[scissor_res_simul1$Scissor_neg] <- "Scissor - cells" sc_sim_dataset <- AddMetaData(seurat_sim, metadata = Scissor_select, col.name = "scissor") library(ggplot2) DimPlot(sc_sim_dataset, reduction = 'umap', group.by = 'scissor', cols = c('grey', 'royalblue', 'indianred1'), pt.size = 1.2, order = c("Scissor + cells", "Scissor - cells", "Background cells")) + ggtitle("Phenotype-associated cell subpopulation")

zxs132 commented 6 days ago

Or if it is possible, I would love to look at the code you used for simulation.

zxs132 commented 6 days ago

thanks