raymondlouie / MiniMarS

4 stars 2 forks source link

Error with dataset 4 #23

Closed anglixue closed 1 year ago

anglixue commented 1 year ago

Hi Team, I've run the latest version and got an error at the final step of performanceAllMarkers()

# Reproduction script for the performanceAllMarkers() error reported in this issue.
# Uncomment to install the development branch of the package:
# devtools::install_github("raymondlouie/ClusterMarkers", ref = "Dev")

library(ClusterMarkers)
library(dplyr)
library(SingleCellExperiment)
library(Seurat)

# Use 10X 10K PBMC Total-seqB samples
data <- Read10X("~/Downloads/filtered_feature_bc_matrix/")
# Build a Seurat object from the gene-expression counts, then attach the
# antibody-capture counts as an additional assay named "Protein".
seurat_object = CreateSeuratObject(counts = data$`Gene Expression`)
seurat_object[['Protein']] = CreateAssayObject(counts = data$`Antibody Capture`)

# Read the per-cell cluster assignments (columns used below: Barcode, Cluster)
clst=read.csv("~/Downloads/analysis/clustering/graphclust/clusters.csv",header=T)
# Check that the cell barcodes of the Seurat object match the cluster table
# element by element. The result is not enforced: a FALSE here does not stop the script.
all(colnames(seurat_object)==clst$Barcode)
# Store the cluster labels in the cell metadata
seurat_object@meta.data$Cluster = clst$Cluster

# This doesn't need to be transposed because the count matrix is already in Seurat's acceptable dimension
input_matrix = seurat_object@assays$Protein@counts
clusters = seurat_object@meta.data$Cluster

# Seurat input example: a protein-only Seurat object whose identities are the cluster labels.

sc_object = CreateSeuratObject(input_matrix, assay = "Protein")
Idents(object = sc_object) <- clusters
seurat_in = processInputFormat(sc_object=sc_object,
                               verbose=TRUE)

# Select clusters
clusters_sel = 1:11
sc_in = seurat_in # Use the Seurat-derived input created above

cluster_selection_out= processClusterSelection(sc_in,
                                               clusters_sel=clusters_sel,
                                               verbose=TRUE)
# Subsample the selected cells; the result provides the training data used below
# (final_out$training_matrix, final_out$training_clusters).

final_out = processSubsampling(cluster_selection_out,
                               clusters_sel="all_clusters",
                               subsample_num=1000,
                               train_test_ratio = 0.9,
                               cluster_proportion= "proportional",
                               verbose=TRUE)

# Main step: find up to 15 cluster markers on the training data with method = "all"
list_markers = findClusterMarkers(final_out$training_matrix,
                                  final_out$training_clusters,
                                  num_markers=15,
                                  method="all",
                                  verbose=TRUE)

# Evaluate performance of the marker sets. This is the call that raised the reported error.
list_performance = performanceAllMarkers(list_markers,
                                         final_out=final_out,
                                         method="all",
                                         nrounds=1500,
                                         nthread=6,
                                         verbose=TRUE)

The error message is: `Error in .check_argument_correct(dots, "genes.selection", is.character, : Check genes.selection - should be character vector`

raymondlouie commented 1 year ago

Yep - if you use the new code in the readme, the error will be fixed. The error occurred because there is a new item in list_markers containing the new timing information Hsiao-Chi added. The new code will separate this out.

anglixue commented 1 year ago

These functions now run without errors using the new code.

> list_markers_time
$citeFuse
 [1] "CD45RO-TotalSeqB"       "CD45RA-TotalSeqB"       "CD3-TotalSeqB"          "CD4-TotalSeqB"         
 [5] "CD14-TotalSeqB"         "CD127-TotalSeqB"        "CD56-TotalSeqB"         "TIGIT-TotalSeqB"       
 [9] "CD16-TotalSeqB"         "CD19-TotalSeqB"         "CD8a-TotalSeqB"         "CD25-TotalSeqB"        
[13] "PD-1-TotalSeqB"         "CD15-TotalSeqB"         "IgG1-control-TotalSeqB"

$sc2marker
 [1] "IgG2b-control-TotalSeqB" "PD-1-TotalSeqB"          "TIGIT-TotalSeqB"         "CD127-TotalSeqB"        
 [5] "IgG1-control-TotalSeqB"  "CD14-TotalSeqB"          "CD25-TotalSeqB"          "IgG2a-control-TotalSeqB"
 [9] "CD19-TotalSeqB"          "CD56-TotalSeqB"          "CD4-TotalSeqB"          

$geneBasis
 [1] "CD8a-TotalSeqB" "CD16-TotalSeqB" "CD4-TotalSeqB"  "CD19-TotalSeqB" "CD56-TotalSeqB" "PD-1-TotalSeqB"
 [7] "CD25-TotalSeqB" NA               NA               NA               NA               NA              
[13] NA               NA               NA              

$xgBoost
 [1] "CD14-TotalSeqB"          "CD45RO-TotalSeqB"        "CD3-TotalSeqB"           "CD19-TotalSeqB"         
 [5] "CD4-TotalSeqB"           "CD56-TotalSeqB"          "CD45RA-TotalSeqB"        "CD127-TotalSeqB"        
 [9] "CD16-TotalSeqB"          "TIGIT-TotalSeqB"         "CD25-TotalSeqB"          "CD8a-TotalSeqB"         
[13] "PD-1-TotalSeqB"          "CD15-TotalSeqB"          "IgG2b-control-TotalSeqB"

$consensus
 [1] "CD19-TotalSeqB"         "CD25-TotalSeqB"         "CD4-TotalSeqB"          "CD56-TotalSeqB"        
 [5] "PD-1-TotalSeqB"         "CD127-TotalSeqB"        "CD14-TotalSeqB"         "CD16-TotalSeqB"        
 [9] "CD8a-TotalSeqB"         "TIGIT-TotalSeqB"        "CD15-TotalSeqB"         "CD3-TotalSeqB"         
[13] "CD45RA-TotalSeqB"       "CD45RO-TotalSeqB"       "IgG1-control-TotalSeqB"

$runtime_secs
 citeFuse sc2marker geneBasis   xgBoost 
 5.095403  4.500375  1.762664  5.292093 

and the performance

> list_performance
$citeFuse
$citeFuse$xgBoost_performance
   cluster        TP
1        5 0.5555556
2       11 0.0000000
3        3 0.8333333
4        7 0.7142857
5        1 1.0000000
6        2 0.9411765
7        6 0.8750000
8       10 0.7500000
9        4 0.5454545
10       8 0.6666667
11       9 0.5000000

$citeFuse$geneBasis_performance
   cluster        TP
1        5 0.2222222
2       11 0.0000000
3        3 0.7500000
4        7 0.4285714
5        1 1.0000000
6        2 0.8823529
7        6 0.7500000
8       10 0.0000000
9        4 0.3636364
10       8 0.3333333
11       9 0.0000000

$sc2marker
$sc2marker$xgBoost_performance
   cluster        TP
1        5 0.4444444
2       11 0.0000000
3        3 0.7500000
4        7 0.7142857
5        1 0.8571429
6        2 0.8823529
7        6 0.8750000
8       10 0.2500000
9        4 0.5454545
10       8 0.5000000
11       9 0.5000000

$sc2marker$geneBasis_performance
   cluster        TP
1        5 0.2222222
2       11 0.0000000
3        3 0.7500000
4        7 0.1428571
5        1 0.9523810
6        2 0.5882353
7        6 0.8750000
8       10 0.2500000
9        4 0.5454545
10       8 0.3333333
11       9 0.5000000

$geneBasis
$geneBasis$xgBoost_performance
   cluster        TP
1        5 0.6666667
2       11 0.0000000
3        3 0.8333333
4        7 0.7142857
5        1 0.7142857
6        2 0.7647059
7        6 0.8750000
8       10 0.0000000
9        4 0.3636364
10       8 0.5000000
11       9 0.0000000

$geneBasis$geneBasis_performance
   cluster        TP
1        5 0.2222222
2       11 0.0000000
3        3 0.6666667
4        7 0.2857143
5        1 0.7619048
6        2 0.5294118
7        6 1.0000000
8       10 0.0000000
9        4 0.4545455
10       8 0.0000000
11       9 0.0000000

$xgBoost
$xgBoost$xgBoost_performance
   cluster        TP
1        5 0.6666667
2       11 0.0000000
3        3 0.8333333
4        7 0.7142857
5        1 1.0000000
6        2 0.8823529
7        6 0.8750000
8       10 0.0000000
9        4 0.4545455
10       8 0.6666667
11       9 0.5000000

$xgBoost$geneBasis_performance
   cluster        TP
1        5 0.2222222
2       11 0.0000000
3        3 0.7500000
4        7 0.4285714
5        1 1.0000000
6        2 0.8823529
7        6 0.7500000
8       10 0.0000000
9        4 0.3636364
10       8 0.3333333
11       9 0.0000000

$consensus
$consensus$xgBoost_performance
   cluster        TP
1        5 0.5555556
2       11 0.0000000
3        3 0.8333333
4        7 0.7142857
5        1 1.0000000
6        2 1.0000000
7        6 0.8750000
8       10 0.5000000
9        4 0.5454545
10       8 0.6666667
11       9 0.5000000

$consensus$geneBasis_performance
   cluster        TP
1        5 0.2222222
2       11 0.0000000
3        3 0.7500000
4        7 0.4285714
5        1 1.0000000
6        2 0.8823529
7        6 0.7500000
8       10 0.0000000
9        4 0.3636364
10       8 0.3333333
11       9 0.0000000

But the plot functions are not working, due to the inconsistent number of markers across methods:

> library(ggplot2)
> plotMarkers(list_markers)
Error in (function (..., row.names = NULL, check.rows = FALSE, check.names = TRUE,  : 
  arguments imply differing number of rows: 15, 11
> plotPerformance(list_performance)
Error in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y,  : 
  polygon edge not found
In addition: There were 12 warnings (use warnings() to see them)

Have you encountered this with other datasets? I also tried the built-in sce object. The first plot function is fine, but the second showed the same error.

raymondlouie commented 1 year ago

Thanks Angli - did you use the latest version of plotPerformance? I got the same error with a previous version, but it should've been fixed.

Essentially, the new code in plotPerformance is:

`

# Reshape performance_df into a long data frame (performance_plot_df) for plotting.
# Columns are consumed in adjacent pairs: i takes the values 1, 3, 5, ... and each
# iteration handles columns i and i+1.
# NOTE(review): column names presumably look like
# "<marker_method>.<performance_method>_performance.<column>" — confirm against
# how performance_df is built.
for (i in seq(1,dim(performance_df)[2],2)){
        curr_df = performance_df[,c(i,i+1)]
        # Split both column names on a literal "." and flatten the pieces;
        # elements 1 and 2 therefore come from the first column's name.
        tempSplit = unlist(lapply(as.character(colnames(curr_df)),
                                  function (x) strsplit(x,split="[.]")[[1]]))
        colnames(curr_df) = c("Clusters","TP")
        curr_df$marker_method=tempSplit[[1]]
        # Drop the "_performance" suffix so only the method name remains
        curr_df$performance_method=gsub("_performance",
                                        "",
                                        tempSplit[[2]])
        # NOTE(review): bare expression; its value is discarded inside the loop,
        # so this line has no effect — likely a debugging leftover.
        curr_df$TP
        # The first pair initialises the accumulator; later pairs are appended row-wise.
        if (i==1){
            performance_plot_df = curr_df
        } else{
            performance_plot_df = rbind(performance_plot_df,curr_df)
        }

    }

`

anglixue commented 1 year ago

Hi Ray,

I used the latest version.

Which dataset did you test? Maybe it's because we used different datasets?

raymondlouie commented 1 year ago

I tried the SCE example dataset (I think you mentioned you used this, but I might have misunderstood your message). I also tried the datasets Dhruti and Hsiao-Chi originally had trouble with. Do you mind letting me know which datasets caused the error, so I can reproduce it and try to fix it?

anglixue commented 1 year ago

I had errors with both the SCE example dataset and the public dataset 4. Using the SCE example dataset I can do the first plot but not the second. Using public dataset 4 neither plot function is working.

anglixue commented 1 year ago

Hi Ray, Could you share the two example plots in Dropbox? I'll have a look and find the root of the error from my end.

anglixue commented 1 year ago

I changed line 108 in wrapperPlots.R from `axis.text.x = element_text(angle=45,hjust=1,text_size),` to `axis.text.x = element_text(angle=45,hjust=1,size=text_size),` and the error is fixed.

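I believe element_text() was matching text_size to the wrong argument: passed without a name, it is matched positionally to the first unmatched formal argument of element_text() (`family`) rather than to `size`.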

raymondlouie commented 1 year ago

Awesome, thanks Angli!

raymondlouie commented 1 year ago

I just made your change to main.