Closed mattansb closed 1 year ago
@mattansb I think the new get_data()
"philosophy" is that we should return the original data, mostly untransformed, except for listwise deletion and column subsets. So I agree that the long format is better: it matches mtcars
. However, I don't think we should transform the second column to factor.
Fair enough, but then you can argue that the vector inputs should also remain free vectors, in a list?
Getting data from htest objects is 🗑️+🔥
get_data()
for htest, but I'm not sure why the last example fails? According to my code, it should work! :-D
library(insight)
tt1 <- t.test(mtcars$mpg)
tt2 <- t.test(mtcars$mpg ~ 1)
insight::get_data(tt1) |> head()
#> mpg
#> 1 21.0
#> 2 21.0
#> 3 22.8
#> 4 21.4
#> 5 18.7
#> 6 18.1
insight::get_data(tt2) |> head()
#> mpg
#> 1 21.0
#> 2 21.0
#> 3 22.8
#> 4 21.4
#> 5 18.7
#> 6 18.1
# Two sample
tt3 <- t.test(mtcars$mpg ~ mtcars$am)
tt4 <- t.test(mtcars$mpg[mtcars$am==0], mtcars$mpg[mtcars$am==1])
insight::get_data(tt3) |> head()
#> x y
#> 1 21.0 1
#> 2 21.0 1
#> 3 22.8 1
#> 4 21.4 1
#> 5 18.7 1
#> 6 18.1 1
insight::get_data(tt4) |> head()
#> x y
#> 1 21.4 1
#> 2 18.7 1
#> 3 18.1 1
#> 4 14.3 1
#> 5 24.4 1
#> 6 22.8 1
# Paired
tt5 <- t.test(sleep$extra ~ sleep$group, paired = TRUE)
tt6 <- t.test(sleep$extra[sleep$group == "1"], sleep$extra[sleep$group == "2"], paired = TRUE)
tt7 <- t.test(Pair(sleep$extra[sleep$group == "1"], sleep$extra[sleep$group == "2"]) ~ 1)
insight::get_data(tt5) |> head()
#> NULL
insight::get_data(tt6) |> head()
#> x y
#> 1 0.7 1
#> 2 -1.6 1
#> 3 -0.2 1
#> 4 -1.2 1
#> 5 -0.1 1
#> 6 3.4 1
insight::get_data(tt7) |> head()
#> x y
#> A 0.7 1.9
#> B -1.6 0.8
#> C -0.2 1.1
#> D -1.2 0.1
#> E -0.1 -0.1
#> F 3.4 4.4
Created on 2023-02-21 with reprex v2.0.2
Weird that tt5
doesn't work...
I think tt7
should also be "long", no?
tt5 should work now; I forgot to check for "by". Yes, tt7 should be long, and the code should do this, but it doesn't work.
tt7 is still open...
tt7 is hard. I think it should return a single Pair
column - what do you think?
df <- data.frame(
x = I(Pair(sleep$extra[sleep$group == "1"], sleep$extra[sleep$group == "2"]))
)
str(df)
#> 'data.frame': 10 obs. of 1 variable:
#> $ x: 'AsIs' num [1:10, 1:2] 0.7 -1.6 -0.2 -1.2 -0.1 3.4 3.7 0.8 0 2 ...
#> ..- attr(*, "dimnames")=List of 2
#> .. ..$ : NULL
#> .. ..$ : chr [1:2] "x" "y"
Created on 2023-02-21 with reprex v2.0.2
@strengejacke insight::get_data()
for tt1
-tt7
should also apply to wilcox.test()
:
x <<- 1:4
y <<- c(1, 1:3)
t.test(x, y) |> insight::get_data()
#> x y
#> 1 1 1
#> 2 2 1
#> 3 3 1
#> 4 4 1
#> 5 1 2
#> 6 1 2
#> 7 2 2
#> 8 3 2
wilcox.test(x, y) |> insight::get_data()
#> Warning in wilcox.test.default(x, y): cannot compute exact p-value with ties
#> x y
#> 1 1 1
#> 2 2 1
#> 3 3 2
#> 4 4 3
Created on 2023-02-24 with reprex v2.0.2
tt7 is hard. I think it should return a single
Pair
column - what do you think?
I have this code:
} else if (startsWith(x$method, "Paired")) {
if (grepl(" (and|by) ", x$data.name)) {
return(data.frame(
x = unlist(columns),
y = c(
rep("1", length(columns[[1]])),
rep("2", length(columns[[2]]))
),
stringsAsFactors = TRUE
))
} else if (startsWith(x$data.name, "Pair(")) {
return(data.frame(
x = c(columns[[1]][, 1, drop = TRUE], columns[[1]][, 2, drop = TRUE]),
y = c(
rep("1", nrow(columns[[1]])),
rep("2", nrow(columns[[1]]))
),
stringsAsFactors = TRUE
))
}
}
But it doesn't seem to be executed when I have a tt7-model? Strange...
It's because it is picked up by:
if (!grepl(" (and|by) ", x$data.name) &&
(startsWith(x$method, "McNemar") || (length(columns) == 1 && is.matrix(columns[[1]])))) {
return(as.table(columns[[1]]))
# check if data is a list for kruskal-wallis
}
Here are some benchmarks for all of these.
library(insight)
# One sample
tt1 <- t.test(mtcars$mpg)
tt2 <- t.test(mtcars$mpg ~ 1)
wt1 <- wilcox.test(mtcars$mpg)
wt2 <- wilcox.test(mtcars$mpg ~ 1)
# Two sample
tt3 <- t.test(mtcars$mpg ~ mtcars$am)
tt4 <- t.test(mtcars$mpg[mtcars$am==0], mtcars$mpg[mtcars$am==1])
wt3 <- wilcox.test(mtcars$mpg ~ mtcars$am)
wt4 <- wilcox.test(mtcars$mpg[mtcars$am==0], mtcars$mpg[mtcars$am==1])
# Paired
tt5 <- t.test(sleep$extra ~ sleep$group, paired = TRUE)
tt6 <- t.test(sleep$extra[sleep$group == "1"], sleep$extra[sleep$group == "2"], paired = TRUE)
tt7 <- t.test(Pair(sleep$extra[sleep$group == "1"], sleep$extra[sleep$group == "2"]) ~ 1)
wt5 <- wilcox.test(sleep$extra ~ sleep$group, paired = TRUE)
wt6 <- wilcox.test(sleep$extra[sleep$group == "1"], sleep$extra[sleep$group == "2"], paired = TRUE)
wt7 <- wilcox.test(Pair(sleep$extra[sleep$group == "1"], sleep$extra[sleep$group == "2"]) ~ 1)
expect_equal(dim(get_data(tt1)), c(32, 1))
expect_equal(dim(get_data(tt2)), c(32, 1))
expect_equal(dim(get_data(tt3)), c(32, 2))
expect_equal(dim(get_data(tt4)), c(32, 2))
expect_equal(dim(get_data(tt5)), c(20, 2)) # FAILS
expect_equal(dim(get_data(tt6)), c(20, 2))
expect_equal(dim(get_data(tt7)), c(20, 2)) # FAILS
library(testthat)
expect_equal(get_data(wt1), get_data(tt1))
expect_equal(get_data(wt2), get_data(tt2))
expect_equal(get_data(wt3), get_data(tt3)) # FAILS
expect_equal(get_data(wt4), get_data(tt4)) # FAILS
expect_equal(get_data(wt5), get_data(tt5)) # FAILS
expect_equal(get_data(wt6), get_data(tt6)) # FAILS
expect_equal(get_data(wt7), get_data(tt7))
works now:
library(insight)
library(testthat)
data(mtcars)
data(sleep)
# One sample
tt1 <- t.test(mtcars$mpg)
tt2 <- t.test(mtcars$mpg ~ 1)
wt1 <- wilcox.test(mtcars$mpg)
#> Warning in wilcox.test.default(mtcars$mpg): cannot compute exact p-value with
#> ties
wt2 <- wilcox.test(mtcars$mpg ~ 1)
#> Warning in wilcox.test.default(x = respVar, ...): cannot compute exact p-value
#> with ties
# Two sample
tt3 <- t.test(mtcars$mpg ~ mtcars$am)
tt4 <- t.test(mtcars$mpg[mtcars$am==0], mtcars$mpg[mtcars$am==1])
wt3 <- wilcox.test(mtcars$mpg ~ mtcars$am)
#> Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
#> compute exact p-value with ties
wt4 <- wilcox.test(mtcars$mpg[mtcars$am==0], mtcars$mpg[mtcars$am==1])
#> Warning in wilcox.test.default(mtcars$mpg[mtcars$am == 0], mtcars$mpg[mtcars$am
#> == : cannot compute exact p-value with ties
# Paired
tt5 <- t.test(sleep$extra ~ sleep$group, paired = TRUE)
tt6 <- t.test(sleep$extra[sleep$group == "1"], sleep$extra[sleep$group == "2"], paired = TRUE)
tt7 <- t.test(Pair(sleep$extra[sleep$group == "1"], sleep$extra[sleep$group == "2"]) ~ 1)
wt5 <- wilcox.test(sleep$extra ~ sleep$group, paired = TRUE)
#> Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
#> compute exact p-value with ties
#> Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
#> compute exact p-value with zeroes
wt6 <- wilcox.test(sleep$extra[sleep$group == "1"], sleep$extra[sleep$group == "2"], paired = TRUE)
#> Warning in wilcox.test.default(sleep$extra[sleep$group == "1"],
#> sleep$extra[sleep$group == : cannot compute exact p-value with ties
#> Warning in wilcox.test.default(sleep$extra[sleep$group == "1"],
#> sleep$extra[sleep$group == : cannot compute exact p-value with zeroes
wt7 <- wilcox.test(Pair(sleep$extra[sleep$group == "1"], sleep$extra[sleep$group == "2"]) ~ 1)
#> Warning in wilcox.test.default(x = respVar[, 1L], y = respVar[, 2L], paired =
#> TRUE, : cannot compute exact p-value with ties
#> Warning in wilcox.test.default(x = respVar[, 1L], y = respVar[, 2L], paired =
#> TRUE, : cannot compute exact p-value with zeroes
expect_equal(dim(get_data(tt1)), c(32, 1))
expect_equal(dim(get_data(tt2)), c(32, 1))
expect_equal(dim(get_data(tt3)), c(32, 2))
expect_equal(dim(get_data(tt4)), c(32, 2))
expect_equal(dim(get_data(tt5)), c(20, 2)) # FAILS
expect_equal(dim(get_data(tt6)), c(20, 2))
expect_equal(dim(get_data(tt7)), c(20, 2)) # FAILS
expect_equal(get_data(wt1), get_data(tt1))
expect_equal(get_data(wt2), get_data(tt2))
expect_equal(get_data(wt3), get_data(tt3)) # FAILS
expect_equal(get_data(wt4), get_data(tt4)) # FAILS
expect_equal(get_data(wt5), get_data(tt5)) # FAILS
expect_equal(get_data(wt6), get_data(tt6)) # FAILS
expect_equal(get_data(wt7), get_data(tt7))
Created on 2023-02-24 with reprex v2.0.2
what about other htests?
I refactored the code in .retrieve_htest_data()
a bit, so it's easier to follow and maintain.
As far as I know, the other htests are okay (:
Thanks Daniel!
For independent t-tests, we can get two different data structures from
get_data()
depending on the input to the t.test()
:
Created on 2023-02-19 with reprex v2.0.2
(This is also true for
wilcox.test()
). I personally prefer the first ("long") format, with the second column converted to a factor.
(This issue needs to be resolved for a reliable fix for https://github.com/easystats/effectsize/issues/563 / https://github.com/easystats/effectsize/pull/564)