kassambara / rstatix

Pipe-friendly Framework for Basic Statistical Tests in R
https://rpkgs.datanovia.com/rstatix/
444 stars 50 forks source link

rstatix::t_test() doesen't work with unused factor levels while stats::t.test() accommodates them. #133

Open caparks2 opened 3 years ago

caparks2 commented 3 years ago

In the following example, base R stats::t.test() works, but rstatix::t_test() doesn't. I think the reason for the error is because of unused factor levels.

# Import libraries.
library(tidyverse)
library(rstatix)
#> 
#> Attaching package: 'rstatix'
#> The following object is masked from 'package:stats':
#> 
#>     filter

# Example data to reproduce the error.
df <- tibble(
  patient_id = c(
    10, 19, 05, 02, 14, 15, 21, 12, 10, 19,
    05, 02, 14, 15, 21, 12, 10, 19, 05, 02,
    14, 15, 21, 12, 10, 19, 05, 02, 14, 15,
    21, 12, 10, 19, 05, 02, 14, 15, 21, 12,
    10, 19, 05, 02, 14, 15, 21, 12, 10, 19,
    05, 02, 14, 15, 21, 12, 10, 19, 05, 02,
    14, 15, 21, 12, 10, 19, 05, 02, 14, 15,
    21, 12, 10, 19, 05, 02, 14, 15, 21, 12,
    10, 19, 05, 02, 14, 15, 21, 12, 10, 19,
    05, 02, 14, 15, 21, 12, 10, 19, 05, 02,
    14, 15, 21, 12, 10, 19, 05, 02, 14, 15,
    21, 12, 10, 19, 05, 02, 14, 15, 21, 12,
    10, 19, 05, 02, 14, 15, 21, 12, 10, 19,
    05, 02, 14, 15, 21, 12, 10, 19, 05, 02,
    14, 15, 21, 12
  ),
  t_cell_subset = c(
    "CD4", "CD4", "CD4", "CD4", "CD4", "CD4",
    "CD4", "CD4", "CD4", "CD4", "CD4", "CD4",
    "CD4", "CD4", "CD4", "CD4", "CD8", "CD8",
    "CD8", "CD8", "CD8", "CD8", "CD8", "CD8",
    "CD8", "CD8", "CD8", "CD8", "CD8", "CD8",
    "CD8", "CD8", "CD4", "CD4", "CD4", "CD4",
    "CD4", "CD4", "CD4", "CD4", "CD4", "CD4",
    "CD4", "CD4", "CD4", "CD4", "CD4", "CD4",
    "CD8", "CD8", "CD8", "CD8", "CD8", "CD8",
    "CD8", "CD8", "CD8", "CD8", "CD8", "CD8",
    "CD8", "CD8", "CD8", "CD8", "CD4", "CD4",
    "CD4", "CD4", "CD4", "CD4", "CD4", "CD4",
    "CD4", "CD4", "CD4", "CD4", "CD4", "CD4",
    "CD4", "CD4", "CD8", "CD8", "CD8", "CD8",
    "CD8", "CD8", "CD8", "CD8", "CD8", "CD8",
    "CD8", "CD8", "CD8", "CD8", "CD8", "CD8",
    "CD4", "CD4", "CD4", "CD4", "CD4", "CD4",
    "CD4", "CD4", "CD4", "CD4", "CD4", "CD4",
    "CD4", "CD4", "CD4", "CD4", "CD4", "CD4",
    "CD4", "CD4", "CD4", "CD4", "CD4", "CD4",
    "CD8", "CD8", "CD8", "CD8", "CD8", "CD8",
    "CD8", "CD8", "CD8", "CD8", "CD8", "CD8",
    "CD8", "CD8", "CD8", "CD8", "CD8", "CD8",
    "CD8", "CD8", "CD8", "CD8", "CD8", "CD8"
  ),
  phenotype = c(
    "SP", "SP", "SP", "SP", "SP", "SP", "SP",
    "SP", "DN", "DN", "DN", "DN", "DN", "DN",
    "DN", "DN", "SP", "SP", "SP", "SP", "SP",
    "SP", "SP", "SP", "DN", "DN", "DN", "DN",
    "DN", "DN", "DN", "DN", "DP", "DP", "DP",
    "DP", "DP", "DP", "DP", "DP", "DN", "DN",
    "DN", "DN", "DN", "DN", "DN", "DN", "DP",
    "DP", "DP", "DP", "DP", "DP", "DP", "DP",
    "DN", "DN", "DN", "DN", "DN", "DN", "DN",
    "DN", "DP", "DP", "DP", "DP", "DP", "DP",
    "DP", "DP", "SP", "SP", "SP", "SP", "SP",
    "SP", "SP", "SP", "DP", "DP", "DP", "DP",
    "DP", "DP", "DP", "DP", "SP", "SP", "SP",
    "SP", "SP", "SP", "SP", "SP", "DP", "DP",
    "DP", "DP", "DP", "DP", "DP", "DP", "SP",
    "SP", "SP", "SP", "SP", "SP", "SP", "SP",
    "DN", "DN", "DN", "DN", "DN", "DN", "DN",
    "DN", "DP", "DP", "DP", "DP", "DP", "DP",
    "DP", "DP", "SP", "SP", "SP", "SP", "SP",
    "SP", "SP", "SP", "DN", "DN", "DN", "DN",
    "DN", "DN", "DN", "DN"
  ),
  seqs_shared_with = c(
    "DP", "DP", "DP", "DP", "DP", "DP",
    "DP", "DP", "DP", "DP", "DP", "DP",
    "DP", "DP", "DP", "DP", "DP", "DP",
    "DP", "DP", "DP", "DP", "DP", "DP",
    "DP", "DP", "DP", "DP", "DP", "DP",
    "DP", "DP", "SP", "SP", "SP", "SP",
    "SP", "SP", "SP", "SP", "SP", "SP",
    "SP", "SP", "SP", "SP", "SP", "SP",
    "SP", "SP", "SP", "SP", "SP", "SP",
    "SP", "SP", "SP", "SP", "SP", "SP",
    "SP", "SP", "SP", "SP", "DN", "DN",
    "DN", "DN", "DN", "DN", "DN", "DN",
    "DN", "DN", "DN", "DN", "DN", "DN",
    "DN", "DN", "DN", "DN", "DN", "DN",
    "DN", "DN", "DN", "DN", "DN", "DN",
    "DN", "DN", "DN", "DN", "DN", "DN",
    "Blood", "Blood", "Blood", "Blood",
    "Blood", "Blood", "Blood", "Blood",
    "Blood", "Blood", "Blood", "Blood",
    "Blood", "Blood", "Blood", "Blood",
    "Blood", "Blood", "Blood", "Blood",
    "Blood", "Blood", "Blood", "Blood",
    "Blood", "Blood", "Blood", "Blood",
    "Blood", "Blood", "Blood", "Blood",
    "Blood", "Blood", "Blood", "Blood",
    "Blood", "Blood", "Blood", "Blood",
    "Blood", "Blood", "Blood", "Blood",
    "Blood", "Blood", "Blood", "Blood"
  ),
  frac_seqs_shared = c(
    0.0039, 0.0220, 0.0014, 0.0130, 0.0000,
    0.0260, 0.0074, 0.0068, 0.0060, 0.0200,
    0.0011, 0.0140, 0.0000, 0.0310, 0.0100,
    0.0064, 0.0520, 0.1700, 0.0350, 0.0940,
    0.0160, 0.1400, 0.1300, 0.2200, 0.0520,
    0.2000, 0.0310, 0.0790, 0.0000, 0.1800,
    0.1300, 0.1400, 0.1800, 0.4000, 0.2700,
    0.2800, 0.0000, 0.4100, 0.2900, 0.2300,
    0.3000, 0.2900, 0.1700, 0.2800, 0.0500,
    0.2600, 0.1900, 0.2600, 0.2400, 0.4000,
    0.3100, 0.2500, 0.1100, 0.2200, 0.2100,
    0.1600, 0.2800, 0.4400, 0.2000, 0.2700,
    0.2300, 0.4800, 0.6000, 0.3100, 0.0800,
    0.3300, 0.1800, 0.0930, 0.0000, 0.1400,
    0.0490, 0.1200, 0.0880, 0.2700, 0.1600,
    0.0820, 0.0070, 0.0740, 0.0230, 0.1500,
    0.1200, 0.1100, 0.1800, 0.1200, 0.0000,
    0.1000, 0.0350, 0.0580, 0.1400, 0.1000,
    0.1200, 0.1600, 0.0240, 0.1700, 0.0970,
    0.1800, 0.5300, 0.2800, 0.3800, 0.2600,
    0.0000, 0.3600, 0.2600, 0.5000, 0.4500,
    0.2400, 0.2200, 0.2800, 0.2500, 0.2900,
    0.2100, 0.5100, 0.5600, 0.3100, 0.2700,
    0.3900, 0.2800, 0.4100, 0.2800, 0.7000,
    0.4000, 0.1600, 0.2700, 0.2500, 0.2200,
    0.3500, 0.1000, 0.1900, 0.3000, 0.1700,
    0.2100, 0.2800, 0.6000, 0.4900, 0.2500,
    0.4400, 0.4000, 0.3400, 0.2200, 0.3200,
    0.4600, 0.7500, 0.6500, 0.5200
  )
)

# Mutate char columns to factors and filter for relevant data.
# Unused factor levels exist after filtering.
df <- df %>%
  mutate(across(where(is.character), as.factor)) %>%
  filter(t_cell_subset == "CD4", phenotype == "DP")

# Base R stats::t.test() handles unused factor levels.
pairwise.t.test(
  x = df$frac_seqs_shared,
  g = df$seqs_shared_with,
  p.adjust.method = "bonferroni",
  paired = TRUE,
  var.equal = FALSE
)
#> 
#>  Pairwise comparisons using paired t tests 
#> 
#> data:  df$frac_seqs_shared and df$seqs_shared_with 
#> 
#>    Blood DN   
#> DN 0.039 -    
#> SP 0.941 0.013
#> 
#> P value adjustment method: bonferroni

# rstatix::t_test() results in an error.
df %>% pairwise_t_test(frac_seqs_shared ~ seqs_shared_with,
  p.adjust.method = "bonferroni",
  paired = TRUE,
  var.equal = FALSE
)
#> Error in t.test.default(x = c(0.53, 0.28, 0.38, 0.26, 0, 0.36, 0.26, 0.5: not enough 'x' observations

If I manually fix the unused factor levels, rstatix::t_test() now works.

# manually fix unused factor levels and perform t tests
df %>%
  mutate(across(where(is.factor), as.character)) %>%
  mutate(across(where(is.character), as.factor)) %>%
  pairwise_t_test(frac_seqs_shared ~ seqs_shared_with,
    p.adjust.method = "bonferroni",
    paired = TRUE,
    var.equal = FALSE
  )
#> # A tibble: 3 × 10
#>   .y.              group1 group2    n1    n2 statistic    df     p p.adj p.adj.signif
#> * <chr>            <chr>  <chr>  <int> <int>     <dbl> <dbl> <dbl> <dbl> <chr>       
#> 1 frac_seqs_shared Blood  DN         8     8      3.31     7 0.013 0.039 *           
#> 2 frac_seqs_shared Blood  SP         8     8      1.09     7 0.314 0.942 ns          
#> 3 frac_seqs_shared DN     SP         8     8     -4.13     7 0.004 0.013 *
robchallen commented 2 years ago

I can confirm this one. Just got me, slightly simpler example

packageVersion("rstatix")
[1] ‘0.7.0’

# Doesn;t work (and throws mysterious error)
iris %>% filter(Species != "setosa") %>% rstatix::t_test(Sepal.Length ~ Species)

# Works:
t.test(Sepal.Length ~ Species, iris %>% filter(Species != "setosa"))

# Work around:
iris %>% filter(Species != "setosa") %>% 
   mutate(across(where(is.factor), forcats::fct_drop)) %>% 
   rstatix::t_test(Sepal.Length ~ Species)
micwij commented 1 year ago

Also noticed this today. I think it would be good if the function would still work with unused factor levels. Maybe the function could automatically drop unused factor levels. Potentially with a warning. With the current error message, I found it quite tricky to figure out, what was going wrong.

stenglein-lab commented 1 month ago

I ran into this and arrived at same conclusion about unused factor levels. Here's another minimal example, with wilcox_test() using the built-in starwars data object with rstatix v 0.7.2:

> packageVersion("rstatix")
[1] ‘0.7.2’
library(tidyverse)
library(rstatix)

# use built-in starwars tibble
sw <- starwars

# cast sex variable (currently chr type) to factor
sw$sex <- as.factor(sw$sex)

# how many characters of different sexes?
sw %>% group_by(sex) %>% summarize(n=n())

# let's just keep male and female characters for this analysis
sw <- filter(sw, str_detect(sex, "ale"))

# run a wilcoxon test: height as a function of sex 
# this errors
sw %>% wilcox_test(height ~ sex)

# drop unused levels from factor 
sw$sex <- droplevels(sw$sex)

# no longer errors
sw %>% wilcox_test(height ~ sex)