statOmics / tradeSeq

TRAjectory-based Differential Expression analysis for SEQuencing data
Other
235 stars 27 forks source link

DEG from tradeseq #260

Open synatkeamsk opened 3 months ago

synatkeamsk commented 3 months ago

Dear Authors,

I used tradeSeq for my monocle3 object to find DEG of various lineage along trajectory. Output provides gene names, waldStat, pvalue and meanLogFC.

This is my first time doing tradeSeq. Therefore, I am wondering whether p-value here referered ajust-pvalue? How to know which genes are upregulated and downregulated. Less than 1 is down or greater than 1 is up? because I did not see any negative meanLogFC in my tradeSeq output at all. Hope you could help review and give me some answers.

BiocManager::install(c("tradeSeq", "SingleCellExperiment", "slingshot"))
library(tradeSeq)
library(SingleCellExperiment)
library(slingshot)

# Extract pseudotime and assume uniform cell weights
pseudotime <- as.numeric(pseudotime(cds_second))
cellWeights <- matrix(1, ncol = 1, nrow = length(pseudotime))

# Create SingleCellExperiment object
sce <- SingleCellExperiment(assays = list(counts = counts(cds_second)))
colData(sce)$pseudotime <- pseudotime
colData(sce)$cellWeights <- cellWeights

# Fit the GAM
sce <- fitGAM(counts = counts(sce), pseudotime = pseudotime, cellWeights = cellWeights)

# Perform differential expression testing as a function of pseudotime! 
diffExpr <- associationTest(sce)
diffExpr_omitna<- na.omit(diffExpr)
significant_genes <- rownames(diffExpr)[which(diffExpr$pvalue < 0.05)]

#filter significant genes 
second_deg_pseudo<- diffExpr_omitna %>% filter(pvalue < 0.05)
write.csv(second_deg_pseudo, file = "second_deg_pseudo.csv")

# View significant genes
print(significant_genes)

Kind Regards,

Synat TradeSeq

synatkeamsk commented 2 months ago

Dear developers, @HectorRDB @jwokaty @nturaga @sandrinedudoit ,

Wondering whether anyone of you could help answer my question regarding tradeseq output in the attach!

I recently adapted my code based on your vignettes:

#first step 
cds_second.cd4 <- cluster_cells(cds_second.cd4, 
                     reduction_method = "UMAP", 
                     resolution = 1e-2)  #fit with Seurat !

# First display, coloring by the cell types from Paul et al
plot_cells(cds_second.cd4, 
           label_groups_by_cluster = FALSE, 
           cell_size = 1,
           color_cells_by = "ident")
# Construct the graph
# Note that, for the rest of the code to run, the graph should be fully connected
cds_second.cd4 <- learn_graph(cds_second.cd4, use_partition = FALSE)
# We find all the cells that are close to the starting point
cell_ids <- colnames(cds_second.cd4)[second_arthritis.cd4$seurat_clusters ==  "3"]
closest_vertex <- cds_second.cd4@principal_graph_aux[["UMAP"]]$pr_graph_cell_proj_closest_vertex
closest_vertex <- as.matrix(closest_vertex[colnames(cds_second.cd4), ])
closest_vertex <- closest_vertex[cell_ids, ]
closest_vertex <- as.numeric(names(which.max(table(closest_vertex))))
mst <- principal_graph(cds)$UMAP
root_pr_nodes <- igraph::V(mst)$name[closest_vertex]

# We compute the trajectory
cds_second.cd4 <- order_cells(cds_second.cd4, root_pr_nodes = root_pr_nodes)
plot_cells(cds_second.cd4, color_cells_by = "pseudotime")

################################################################################
library(magrittr)
# Get the closest vertice for every cell
y_to_cells <-  principal_graph_aux(cds_second.cd4)$UMAP$pr_graph_cell_proj_closest_vertex %>%
  as.data.frame()
y_to_cells$cells <- rownames(y_to_cells)
y_to_cells$Y <- y_to_cells$V1

# Get the root vertices
# It is the same node as above
root <- cds_second.cd4@principal_graph_aux$UMAP$root_pr_nodes

# Get the other endpoints
endpoints <- names(which(igraph::degree(mst) == 1))
endpoints <- endpoints[!endpoints %in% root]

# For each endpoint
cellWeights <- lapply(endpoints, function(endpoint) {
  # We find the path between the endpoint and the root
  path <- igraph::shortest_paths(mst, root, endpoint)$vpath[[1]]
  path <- as.character(path)
  # We find the cells that map along that path
  df <- y_to_cells[y_to_cells$Y %in% path, ]
  df <- data.frame(weights = as.numeric(colnames(cds_second.cd4) %in% df$cells))
  colnames(df) <- endpoint
  return(df)
  }) %>% do.call(what = 'cbind', args = .) %>%
    as.matrix()
rownames(cellWeights) <- colnames(cds_second.cd4)
pseudotime <- matrix(pseudotime(cds_second.cd4), ncol = ncol(cellWeights),
                     nrow = ncol(cds_second.cd4), byrow = FALSE)

# Check for cells with no weights   ==== use it in case of no positive cell weight
if (any(rowSums(cellWeights) == 0)) {
  stop("Some cells have no positive cell weights.")
}

# Exclude cells with no positive weights
valid_cells <- rowSums(cellWeights) > 0
cellWeights <- cellWeights[valid_cells, ]
pseudotime <- pseudotime[valid_cells, ]

# Convert counts to sparse matrix for memory efficiency
counts <- as(counts(cds_second.cd4), "dgCMatrix") # good code 

#fit GAM ! 
sce <- fitGAM(counts = counts,
                pseudotime = pseudotime,
                cellWeights = cellWeights,
                nknots = 5)

I understand you are very busy and appreciate if you could help!

Kind Regards,

Synat