Closed AMBarbosa closed 2 weeks ago
I realize this error actually comes from function vif_df()
, when used in the loop for (i in seq(from = nrow(df.rank), to = 2))
inside vif_select()
. Boiling it down, vif_df()
fails with this:
df <- structure(list(var3 = c(-2.47126191613724, -1.28045397739883,
-2.87943315893136, -2.39505691242115, -3.52312642864623, -2.15787376714855,
-2.8470635995551, -3.40621084907316, -1.82084801567374, -2.89613462680119,
-3.09295280686465, -1.54788702795955, 0.197716811658935), var4 = c(-1.08580714992212,
-0.872818251287383, 0.338808598563681, -1.3592415601649, -1.13893772547255,
-0.895329184841717, -0.736786740600295, -1.283865177828, -1.76003359460473,
-0.941573502380344, -1.62995926634296, -1.60124171841818, -0.0777887868580415
), var9 = c(-0.499059586130419, -0.548405050601784, -0.602135355161387,
-1.24102651506362, -0.781283220413801, -0.847883011063252, -0.744918212071736,
-1.37533655145803, 0.0428460256878719, -1.6367346291579, -0.453181067037663,
-0.776049296438161, -1.43481797357956), var2 = c(1.61724031174641,
1.37117010796349, 1.12537378915446, 1.59745859986818, 0.654943754764419,
0.926529580498604, 1.80409453277929, 0.682222989926673, 0.644254244524167,
2.44851929669553, 0.881815656889134, 0.719922484314521, 1.40464231098472
), var13 = c(7.41040266538704, 13.5312583954783, 7.41040266538704,
7.41040266538704, 8.59618919764273, 7.41040266538704, 13.5312583954783,
7.41040266538704, 7.41040266538704, 7.41040266538704, 12.7635158716984,
7.41040266538704, 12.7635158716984), var7 = c(-0.59723293760077,
-0.376209757038625, -2.12490054154511, -1.06663178472767, -1.99739835379414,
-0.829909212350894, -1.08729797610347, -1.94584018113948, -1.41076728315705,
-1.98272047148696, -1.27098858018749, -1.92633528579173, -1.53504019121613
), var1 = c(3.95160651959727, 5.66322715573043, 4.78428682906746,
3.95160651959727, 8.64699826466527, 5.22375699239895, 2.21685587486771,
6.26460737923668, 7.32858777467081, 5.10810694941697, 13.4349100441188,
17.8296116774337, 4.43733670012155), var8 = c(-0.26816575471459,
-2.22109405865227, -0.684925127360919, -0.610437202620312, -0.992706989333577,
-2.22109405865227, -2.22109405865227, -1.18471391327617, -0.0453480011081498,
-2.22109405865227, -2.22109405865227, 0.458106246115031, -0.668273210278581
), var5 = c(-2.13956432564926, -2.53280664751937, -1.98388315078687,
-1.98901371885298, -2.1294980103762, -2.53280664751937, -1.79970373457131,
-2.28463259624605, -1.80597162662216, -2.53280664751937, -2.53280664751937,
-1.88967135860789, -1.44655372200994), var10 = c(1.68807994597818,
2.18335753902296, 1.71114038431881, 1.01868449149111, 1.23601260419722,
2.19923133723404, 1.48807907883862, 1.51345225278989, 0.851562581385538,
1.94244266486479, 1.99509237036138, 0.825553744909574, 1.88026593557423
), var12 = c(10.7085516923292, 10.7085516923292, 7.58484953479799,
7.58484953479799, 7.58484953479799, 7.58484953479799, 10.7085516923292,
10.7085516923292, 7.58484953479799, 10.7085516923292, 7.58484953479799,
10.7085516923292, 10.7085516923292), var11 = c(1.39792107350584,
0.531589122501453, 0.0122424604371142, -0.331594575349984, 0.21151285716957,
-0.140321572340113, 0.243592808161356, -0.0668651403382106, 0.0183053082573835,
0.143598127729515, -0.122690138644377, 0.00103031167210884, 0.0162193243659431
), var6 = c(5.36188342506708, -0.515563106726135, 5.03519119718127,
3.07867425065343, 7.50496027705085, -0.771442186896905, -0.594395224388743,
3.15879730601383, 4.53699767490813, -0.467873570172355, -0.671221992183987,
0.996382238141532, 0.238651826800801)), class = "data.frame", row.names = c(NA,
-13L))
vif_df(df)
Error in value[[3L]](cond) :
the VIF computation failed. Please use cor_df() or cor_select() to check and remove perfect correlations from df before the VIF assessment.
In addition: Warning message:
In validate_df(df = df, min_rows = 30) :
the number of rows in 'df' is lower than 30. A multicollinearity analysis may fail or yield meaningless results.
Whereas:
cor_df(df)
# A tibble: 78 × 3
# x y correlation
# <chr> <chr> <dbl>
# 1 var10 var8 -0.75
# 2 var5 var8 0.645
# 3 var6 var8 0.61
# 4 var1 var2 -0.555
# 5 var10 var5 -0.553
# 6 var6 var13 -0.538
# 7 var1 var4 -0.533
# 8 var6 var10 -0.517
# 9 var8 var13 -0.503
# 10 var5 var3 0.477
# # ℹ 68 more rows
# # ℹ Use `print(n = ...)` to see more rows
# Warning message:
# In validate_df(df = df, min_rows = ifelse(test = cor_method == "pearson", :
# the number of rows in 'df' is lower than 30. A multicollinearity analysis may fail or yield meaningless results.
It does not fail if you instead use df <- df[1:12, ]
, or if you exclude any of the variables.
Within vif_df()
, the error seems to happen inside the tryCatch
, at:
solve(cor.matrix, tol = tolerance)
# Error in solve.default(cor.matrix, tol = tolerance) :
# Lapack routine dgesv: system is exactly singular: U[13,13] = 0
This error may indeed be caused by perfect correlations, but that doesn't seem to be the cause here. For a more informative error message, I suppose you could add before the tryCatch
within vif_df()
:
if (det(cor.matrix) == 0) stop("The variable correlation matrix is singular, so it cannot be solved.")
But ideally there would be an alternative way to compute the result in these cases?
Regards!
Clues from computing VIF with other packages: with both {usdm} and {fuzzySim}, the VIF is infinite for all variables (and the R-squared is 1), as each variable is perfectly explained by the combination of all others. What are the odds of this happening in an actual ecological dataset?
names(df)
# [1] "var3" "var4" "var9" "var2" "var13" "var7" "var1" "var8" "var5"
# [10] "var10" "var12" "var11" "var6"
# [ variables as reordered inside vif_select() before being passed to vif_df() ]
usdm::vif(df)
# Variables VIF
# 1 var3 Inf
# 2 var4 Inf
# 3 var9 Inf
# 4 var2 Inf
# 5 var13 Inf
# 6 var7 Inf
# 7 var1 Inf
# 8 var8 Inf
# 9 var5 Inf
# 10 var10 Inf
# 11 var12 Inf
# 12 var11 Inf
# 13 var6 Inf
fuzzySim::multicol(df)
# Rsquared Tolerance VIF
# var3 1 0 Inf
# var4 1 0 Inf
# var9 1 0 Inf
# var2 1 0 Inf
# var13 1 0 Inf
# var7 1 0 Inf
# var1 1 0 Inf
# var8 1 0 Inf
# var5 1 0 Inf
# var10 1 0 Inf
# var12 1 0 Inf
# var11 1 0 Inf
# var6 1 0 Inf
heatmap(cor(df))
I realize now that this particular data frame has exactly the same number of rows as predictors, in which case their R-squared is indeed 100% (thus VIF infinite). I hadn't noticed this, and what confused me was that the vif_df()
error doesn't happen when I reduce the number of rows to even less:
collinear::vif_df(df[1:6, ])
# variable vif
# 1 var6 0.000000e+00
# 2 var7 3.904729e+14
# 3 var4 6.476413e+14
# 4 var9 1.828220e+15
# 5 var1 1.903784e+15
# 6 var5 4.072936e+15
# 7 var2 6.624570e+15
# 8 var10 1.040626e+16
# 9 var12 1.050840e+16
# 10 var8 1.156818e+16
# 11 var11 1.221745e+16
# 12 var13 1.362184e+16
# 13 var3 3.553314e+16
# Warning message:
# In validate_df(df = df, min_rows = 30) :
# the number of rows in 'df' is lower than 30. A multicollinearity analysis may fail or yield meaningless results.
Whereas:
fuzzySim::multicol(df[1:6, ])
# Rsquared Tolerance VIF
# var3 1 0 Inf
# var4 1 0 Inf
# var9 1 0 Inf
# var2 1 0 Inf
# var13 1 0 Inf
# var7 1 0 Inf
# var1 1 0 Inf
# var8 1 0 Inf
# var5 1 0 Inf
# var10 1 0 Inf
# var12 1 0 Inf
# var11 1 0 Inf
# var6 1 0 Inf
vif_df()
produces very large but not infinite values, and not all equal to each other, so I think this is a bug. I don't think the warning about <30 rows is enough to prevent mistakes or misuse. So, I would add, before computing cor.matrix
within vif_df()
, something like:
df <- df[, predictors.numeric, drop = FALSE]
if(nrow(df) <= ncol(df)) stop("No more data points than predictors, so VIF is infinite for all predictors.")
Dear @AMBarbosa, Thanks for the report. That's an issue that bugged me during development, and I took a shortcut to address it, thinking that it would be a rare event in ecological data. Let me check it in detail, and I will get back at you as soon as possible. Best, Blas
Most of the issues you have reported should be fixed with version 1.1.2, now in the development branch.
However, as shown below, I could not replicate the singular matrix problem described in this issue. The fact that the function solve() (called within vif_df()) is returning large integers instead of Inf like fuzzySim::multicol() and usdm::vif() might come down to the matrix decomposition method used by lm(). A vif larger than a given threshold is a valid diagnostic, no matter how large the reported vif is, so I cannot address any bugs here.
Still, your comment about the message warning of perfect correlations has been addressed, as not all failures in solve() come from such issue. Additionally, validate_df() now warns the user when the data frame dimensions might cause issues during the multicollinearity analysis.
Thanks a lot for your time testing this package!
Best,
Blas
> df <- structure(list(var3 = c(-2.47126191613724, -1.28045397739883,
+ -2.87943315893136, -2.39505691242115, -3.52312642864623, -2.15787376714855,
+ -2.8470635995551, -3.40621084907316, -1.82084801567374, -2.89613462680119,
+ -3.09295280686465, -1.54788702795955, 0.197716811658935), var4 = c(-1.08580714992212,
+ -0.872818251287383, 0.338808598563681, -1.3592415601649, -1.13893772547255,
+ -0.895329184841717, -0.736786740600295, -1.283865177828, -1.76003359460473,
+ -0.941573502380344, -1.62995926634296, -1.60124171841818, -0.0777887868580415
+ ), var9 = c(-0.499059586130419, -0.548405050601784, -0.602135355161387,
+ -1.24102651506362, -0.781283220413801, -0.847883011063252, -0.744918212071736,
+ -1.37533655145803, 0.0428460256878719, -1.6367346291579, -0.453181067037663,
+ -0.776049296438161, -1.43481797357956), var2 = c(1.61724031174641,
+ 1.37117010796349, 1.12537378915446, 1.59745859986818, 0.654943754764419,
+ 0.926529580498604, 1.80409453277929, 0.682222989926673, 0.644254244524167,
+ 2.44851929669553, 0.881815656889134, 0.719922484314521, 1.40464231098472
+ ), var13 = c(7.41040266538704, 13.5312583954783, 7.41040266538704,
+ 7.41040266538704, 8.59618919764273, 7.41040266538704, 13.5312583954783,
+ 7.41040266538704, 7.41040266538704, 7.41040266538704, 12.7635158716984,
+ 7.41040266538704, 12.7635158716984), var7 = c(-0.59723293760077,
+ -0.376209757038625, -2.12490054154511, -1.06663178472767, -1.99739835379414,
+ -0.829909212350894, -1.08729797610347, -1.94584018113948, -1.41076728315705,
+ -1.98272047148696, -1.27098858018749, -1.92633528579173, -1.53504019121613
+ ), var1 = c(3.95160651959727, 5.66322715573043, 4.78428682906746,
+ 3.95160651959727, 8.64699826466527, 5.22375699239895, 2.21685587486771,
+ 6.26460737923668, 7.32858777467081, 5.10810694941697, 13.4349100441188,
+ 17.8296116774337, 4.43733670012155), var8 = c(-0.26816575471459,
+ -2.22109405865227, -0.684925127360919, -0.610437202620312, -0.992706989333577,
+ -2.22109405865227, -2.22109405865227, -1.18471391327617, -0.0453480011081498,
+ -2.22109405865227, -2.22109405865227, 0.458106246115031, -0.668273210278581
+ ), var5 = c(-2.13956432564926, -2.53280664751937, -1.98388315078687,
+ -1.98901371885298, -2.1294980103762, -2.53280664751937, -1.79970373457131,
+ -2.28463259624605, -1.80597162662216, -2.53280664751937, -2.53280664751937,
+ -1.88967135860789, -1.44655372200994), var10 = c(1.68807994597818,
+ 2.18335753902296, 1.71114038431881, 1.01868449149111, 1.23601260419722,
+ 2.19923133723404, 1.48807907883862, 1.51345225278989, 0.851562581385538,
+ 1.94244266486479, 1.99509237036138, 0.825553744909574, 1.88026593557423
+ ), var12 = c(10.7085516923292, 10.7085516923292, 7.58484953479799,
+ 7.58484953479799, 7.58484953479799, 7.58484953479799, 10.7085516923292,
+ 10.7085516923292, 7.58484953479799, 10.7085516923292, 7.58484953479799,
+ 10.7085516923292, 10.7085516923292), var11 = c(1.39792107350584,
+ 0.531589122501453, 0.0122424604371142, -0.331594575349984, 0.21151285716957,
+ -0.140321572340113, 0.243592808161356, -0.0668651403382106, 0.0183053082573835,
+ 0.143598127729515, -0.122690138644377, 0.00103031167210884, 0.0162193243659431
+ ), var6 = c(5.36188342506708, -0.515563106726135, 5.03519119718127,
+ 3.07867425065343, 7.50496027705085, -0.771442186896905, -0.594395224388743,
+ 3.15879730601383, 4.53699767490813, -0.467873570172355, -0.671221992183987,
+ 0.996382238141532, 0.238651826800801)), class = "data.frame", row.names = c(NA,
+ -13L))
>
>
> vif_df(df)
The number of rows in 'df' should be higher than 30 to ensure a successful multicollinearity analysis.
variable vif
1 var3 789261567120
2 var4 29827966032132
3 var9 356323676019566
4 var2 542963800777175
5 var13 631532969336265
6 var7 866389312101253
7 var1 1189716474071954
8 var8 1204104894968436
9 var5 1284861471669189
10 var10 1294584939394963
11 var12 1312969098046344
12 var11 2227985683922812
13 var6 3132938871214258
Dear Blas,
vif_select()
is giving me an error message about perfect correlations, when I don't have those in my data: my strongest correlation is just above 0.75. Here's a reprex:However:
Thank you in advance!