compbiomed / singleCellTK

Interactively analyze single cell genomic data
https://camplab.net/sctk/
Other
170 stars 74 forks source link

setRowNames affects UMAP Clustering #752

Open MrModenait opened 2 months ago

MrModenait commented 2 months ago

I have been using singleCellTK to import some CellRanger output into R for downstream analysis with DecontX and Seurat, following this vignette. I am aware that is it possible to either first run decontX on a SingleCellExperiment object and then create the Seurat object, or it is possible to create the Seurat object and then run decontX on the counts slot. I decided to begin with the former as it suited my needs the most. When importing data using the importCellRanger function, my matrix is read in with Ensembl gene IDs instead of gene symbols. I would like to convert the Ensemble gene ID to gene symbols for easier visualisation before I generate my Seurat object. However, after a lot of trial and error I have realised that the setRowNames is interfering with my UMAP clustering. In the image below, the first row illustrates the effetct of running decontXwithout renaming the genes. In the second row, I have replaced the GeneIDs with a unique number from 1 to 36,601. I tried to number both before and after applying decontX, without any impact on the data. In the third row, I have applied setRowNames with dedup set to TRUE. However, this notably changes the shape of the data of my UMAP plot. What could be a possible cause for this? I am including my code at the bottom of this post, if it should interest anybody.

DecontX Experiment

# Set start time
  start.time <- Sys.time()
# Import the filtered matrix using singleCellTK
  filtered_matrix <- importCellRanger(sampleDirs = "../Raw Files/2023 Sequencing/PDX2 #5 (Bone Marrow, CH)/",
                                         dataType = "filtered")#
# Import the raw matrix using singleCellTK
  raw_matrix <- importCellRanger(sampleDirs = "../Raw Files/2023 Sequencing/PDX2 #5 (Bone Marrow, CH)/",
                                    dataType = "raw")
# View cell names
  head(rownames(filtered_matrix))
"ENSG00000243485" "ENSG00000237613" "ENSG00000186092" "ENSG00000238009" "ENSG00000239945" "ENSG00000239906"
# Set cutoffs for Seurat
  minimum_cells_per_feature <- 10
  minimum_features_per_cell <- 100

### Create Seurat Objects ###
# Dataset 1: No DecontX, No Rename
  matrix <- filtered_matrix
  dataset1 <- CreateSeuratObject(counts = counts(matrix), project = "CHPDX2_5_BM",
                                      min.cells = minimum_cells_per_feature, min.features = minimum_features_per_cell)
# Dataset 2: No DecontX, Number Gene Names 
  matrix <- filtered_matrix
  rownames(matrix) <- 1:length(rownames(matrix))
  dataset2 <- CreateSeuratObject(counts = counts(matrix), project = "CHPDX2_5_BM",
                                 min.cells = minimum_cells_per_feature, min.features = minimum_features_per_cell)
# Dataset 3: No DecontX, SetRowNames
  matrix <- filtered_matrix
  matrix <- singleCellTK::setRowNames(matrix,rowNames = "feature_name",dedup = TRUE)
  dataset3 <- CreateSeuratObject(counts = counts(matrix), project = "CHPDX2_5_BM",
                                 min.cells = minimum_cells_per_feature, min.features = minimum_features_per_cell)
# Dataset 4: DecontX, No Rename
  matrix <- filtered_matrix
  matrix <- decontX(matrix,background = raw_matrix)
  dataset4 <- CreateSeuratObject(counts = round(decontXcounts(matrix)), project = "CHPDX2_5_BM",
                                 min.cells = minimum_cells_per_feature, min.features = minimum_features_per_cell)
# Dataset 5: DecontX, Number Gene Names Rename Before
  matrix <- filtered_matrix
  rownames(matrix) <- 1:length(rownames(matrix))
  matrix <- decontX(matrix,background = raw_matrix)
  dataset5 <- CreateSeuratObject(counts = round(decontXcounts(matrix)), project = "CHPDX2_5_BM",
                                 min.cells = minimum_cells_per_feature, min.features = minimum_features_per_cell)
# Dataset 6: DecontX, Number Gene Names Rename After
  matrix <- filtered_matrix
  matrix <- decontX(matrix,background = raw_matrix)
  rownames(matrix) <- 1:length(rownames(matrix))
  dataset6 <- CreateSeuratObject(counts = round(decontXcounts(matrix)), project = "CHPDX2_5_BM",
                                 min.cells = minimum_cells_per_feature, min.features = minimum_features_per_cell)
# Dataset 7: DecontX, SetRowNames Before
  matrix <- filtered_matrix
  matrix <- singleCellTK::setRowNames(matrix,rowNames = "feature_name",dedup = TRUE)
  matrix <- decontX(matrix,background = raw_matrix)
  dataset7 <- CreateSeuratObject(counts = round(decontXcounts(matrix)), project = "CHPDX2_5_BM",
                                 min.cells = minimum_cells_per_feature, min.features = minimum_features_per_cell)
# Dataset 8: DecontX, SetRowNames After
  matrix <- filtered_matrix
  matrix <- decontX(matrix,background = raw_matrix)
  matrix <- singleCellTK::setRowNames(matrix,rowNames = "feature_name",dedup = TRUE)
  dataset8 <- CreateSeuratObject(counts = round(decontXcounts(matrix)), project = "CHPDX2_5_BM",
                                 min.cells = minimum_cells_per_feature, min.features = minimum_features_per_cell)

# List the datasets
  dataset_list <- list(dataset1,dataset2,dataset3,dataset4,dataset5,dataset6,dataset7,dataset8)
  names(dataset_list) <- c("NoDecontX_NoRename","NoDecontX_Num","NoDecontX_SetRowName",
                           "DecontX_NoRename","DecontX_NumBefore","DecontX_NumAfter","DecontX_SetRowNamesBefore","DecontX_SetRowNamesAfter")

# Apply analysis
  dimensions_to_use <- 30
  object_list <- lapply(dataset_list, function(diagnostic_object) {
    # Calculate mitochondrial features
      diagnostic_object <- PercentageFeatureSet(diagnostic_object, pattern = "^MT-", col.name = "percent.mt")
    # run sctransform
      diagnostic_object <- SCTransform(diagnostic_object, vars.to.regress = "percent.mt", verbose = FALSE,
                                       return.only.var.genes = FALSE)
    # Run PCA
      diagnostic_object <- RunPCA(diagnostic_object, verbose = FALSE,seed.use = 1999)
    # Run UMAP
      diagnostic_object <- RunUMAP(diagnostic_object, dims = 1:dimensions_to_use, verbose = FALSE,seed.use = 1999)
    # Find neighbors
      diagnostic_object <- FindNeighbors(diagnostic_object, dims = 1:dimensions_to_use, verbose = FALSE,seed.use = 1999)
    # Generate clusters
      diagnostic_object <- FindClusters(diagnostic_object, verbose = FALSE,seed.use = 1999)
    # Return object
      diagnostic_object
  })

# Generate UMAP plots
  UMAP_plot_list <- lapply(seq_along(object_list), function(i) {
    # View UMAP
      DimPlot(object_list[[i]], label = TRUE,label.box = TRUE) + 
        labs(x = "UMAP 1", y = "UMAP 2",
             title = names(dataset_list)[[i]]) +
        theme(axis.title = element_text(size = 12),
              legend.position = "none", plot.title = element_text(hjust = 0.5))
  })
  names(UMAP_plot_list) <- names(dataset_list)
  library(patchwork)
  layout <- "
    #AABB#
    CCDDEE
    FFGGHH
    "
  patchwork::wrap_plots(UMAP_plot_list$NoDecontX_NoRename,UMAP_plot_list$DecontX_NoRename,
             UMAP_plot_list$NoDecontX_Num, UMAP_plot_list$DecontX_NumBefore, UMAP_plot_list$DecontX_NumAfter,
             UMAP_plot_list$NoDecontX_SetRowName, UMAP_plot_list$DecontX_SetRowNamesBefore, UMAP_plot_list$DecontX_SetRowNamesAfter,
             design = layout)
# Calculate time taken
  end.time <- Sys.time()
  time.taken <- round(end.time - start.time,2)
  time.taken