BlasBenito / collinear

R package to manage multicollinearity in modeling data frames.
https://blasbenito.github.io/collinear/
Other
9 stars 1 forks source link

vif_select() error "perfect correlations" when these don't exist #7

Closed AMBarbosa closed 2 weeks ago

AMBarbosa commented 1 month ago

Dear Blas, vif_select() is giving me an error message about perfect correlations, when I don't have those in my data: my strongest correlation is just above 0.75. Here's a reprex:

df <- structure(list(resp = c(15.453450403671, 13.1060632491908, 13.2949628273, 
10.5966847318461, 8.59618919764273, 0.693147180559945, 13.9564535417658, 
5.02388052084628, 1.79175946922805, 10.1267510966509, 10.5363538022111, 
2.30258509299405, 14.9906779411858), var1 = c(3.95160651959727, 
5.66322715573043, 4.78428682906746, 3.95160651959727, 8.64699826466527, 
5.22375699239895, 2.21685587486771, 6.26460737923668, 7.32858777467081, 
5.10810694941697, 13.4349100441188, 17.8296116774337, 4.43733670012155
), var2 = c(1.61724031174641, 1.37117010796349, 1.12537378915446, 
1.59745859986818, 0.654943754764419, 0.926529580498604, 1.80409453277929, 
0.682222989926673, 0.644254244524167, 2.44851929669553, 0.881815656889134, 
0.719922484314521, 1.40464231098472), var3 = c(-2.47126191613724, 
-1.28045397739883, -2.87943315893136, -2.39505691242115, -3.52312642864623, 
-2.15787376714855, -2.8470635995551, -3.40621084907316, -1.82084801567374, 
-2.89613462680119, -3.09295280686465, -1.54788702795955, 0.197716811658935
), var4 = c(-1.08580714992212, -0.872818251287383, 0.338808598563681, 
-1.3592415601649, -1.13893772547255, -0.895329184841717, -0.736786740600295, 
-1.283865177828, -1.76003359460473, -0.941573502380344, -1.62995926634296, 
-1.60124171841818, -0.0777887868580415), var5 = c(-2.13956432564926, 
-2.53280664751937, -1.98388315078687, -1.98901371885298, -2.1294980103762, 
-2.53280664751937, -1.79970373457131, -2.28463259624605, -1.80597162662216, 
-2.53280664751937, -2.53280664751937, -1.88967135860789, -1.44655372200994
), var6 = c(5.36188342506708, -0.515563106726135, 5.03519119718127, 
3.07867425065343, 7.50496027705085, -0.771442186896905, -0.594395224388743, 
3.15879730601383, 4.53699767490813, -0.467873570172355, -0.671221992183987, 
0.996382238141532, 0.238651826800801), var7 = c(-0.59723293760077, 
-0.376209757038625, -2.12490054154511, -1.06663178472767, -1.99739835379414, 
-0.829909212350894, -1.08729797610347, -1.94584018113948, -1.41076728315705, 
-1.98272047148696, -1.27098858018749, -1.92633528579173, -1.53504019121613
), var8 = c(-0.26816575471459, -2.22109405865227, -0.684925127360919, 
-0.610437202620312, -0.992706989333577, -2.22109405865227, -2.22109405865227, 
-1.18471391327617, -0.0453480011081498, -2.22109405865227, -2.22109405865227, 
0.458106246115031, -0.668273210278581), var9 = c(-0.499059586130419, 
-0.548405050601784, -0.602135355161387, -1.24102651506362, -0.781283220413801, 
-0.847883011063252, -0.744918212071736, -1.37533655145803, 0.0428460256878719, 
-1.6367346291579, -0.453181067037663, -0.776049296438161, -1.43481797357956
), var10 = c(1.68807994597818, 2.18335753902296, 1.71114038431881, 
1.01868449149111, 1.23601260419722, 2.19923133723404, 1.48807907883862, 
1.51345225278989, 0.851562581385538, 1.94244266486479, 1.99509237036138, 
0.825553744909574, 1.88026593557423), var11 = c(1.39792107350584, 
0.531589122501453, 0.0122424604371142, -0.331594575349984, 0.21151285716957, 
-0.140321572340113, 0.243592808161356, -0.0668651403382106, 0.0183053082573835, 
0.143598127729515, -0.122690138644377, 0.00103031167210884, 0.0162193243659431
), var12 = structure(c(1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 
2L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), var13 = structure(c(1L, 
3L, 1L, 1L, 5L, 1L, 3L, 1L, 1L, 1L, 6L, 1L, 6L), .Label = c("CR", 
"DD", "EN", "LC", "NT", "VU"), class = "factor")), row.names = c(NA, 
-13L), class = "data.frame")

vif_select(df, predictors = names(df)[-1], response = names(df)[1])

# Error in value[[3L]](cond) : 
#   the VIF computation failed. Please use cor_df() or cor_select() to check and remove perfect correlations from df before the VIF assessment.
# In addition: Warning messages:
#   1: In validate_df(df = df, min_rows = 30) :
#   the number of rows in 'df' is lower than 30. A multicollinearity analysis may fail or yield meaningless results.
# 2: In validate_df(df = df, min_rows = 30) :
#   the number of rows in 'df' is lower than 30. A multicollinearity analysis may fail or yield meaningless results.

However:

cor_df(df)

# A tibble: 92 × 3
# x     y     correlation
# <chr> <chr>       <dbl>
# 1 var6  var13       0.758
# 2 var10 var8       -0.75 
# 3 var8  var5        0.645
# 4 var8  var6        0.61 
# 5 var2  resp        0.601
# 6 var7  var13       0.568
# 7 var4  resp        0.559
# 8 var2  var1       -0.555
# 9 var10 var5       -0.553
# 10 var8  var13       0.539
# ℹ 82 more rows
# ℹ Use `print(n = ...)` to see more rows

Thank you in advance!

AMBarbosa commented 1 month ago

I realize this error actually comes from function vif_df(), when used in the loop for (i in seq(from = nrow(df.rank), to = 2)) inside vif_select(). Boiling it down, vif_df() fails with this:

df <- structure(list(var3 = c(-2.47126191613724, -1.28045397739883, 
-2.87943315893136, -2.39505691242115, -3.52312642864623, -2.15787376714855, 
-2.8470635995551, -3.40621084907316, -1.82084801567374, -2.89613462680119, 
-3.09295280686465, -1.54788702795955, 0.197716811658935), var4 = c(-1.08580714992212, 
-0.872818251287383, 0.338808598563681, -1.3592415601649, -1.13893772547255, 
-0.895329184841717, -0.736786740600295, -1.283865177828, -1.76003359460473, 
-0.941573502380344, -1.62995926634296, -1.60124171841818, -0.0777887868580415
), var9 = c(-0.499059586130419, -0.548405050601784, -0.602135355161387, 
-1.24102651506362, -0.781283220413801, -0.847883011063252, -0.744918212071736, 
-1.37533655145803, 0.0428460256878719, -1.6367346291579, -0.453181067037663, 
-0.776049296438161, -1.43481797357956), var2 = c(1.61724031174641, 
1.37117010796349, 1.12537378915446, 1.59745859986818, 0.654943754764419, 
0.926529580498604, 1.80409453277929, 0.682222989926673, 0.644254244524167, 
2.44851929669553, 0.881815656889134, 0.719922484314521, 1.40464231098472
), var13 = c(7.41040266538704, 13.5312583954783, 7.41040266538704, 
7.41040266538704, 8.59618919764273, 7.41040266538704, 13.5312583954783, 
7.41040266538704, 7.41040266538704, 7.41040266538704, 12.7635158716984, 
7.41040266538704, 12.7635158716984), var7 = c(-0.59723293760077, 
-0.376209757038625, -2.12490054154511, -1.06663178472767, -1.99739835379414, 
-0.829909212350894, -1.08729797610347, -1.94584018113948, -1.41076728315705, 
-1.98272047148696, -1.27098858018749, -1.92633528579173, -1.53504019121613
), var1 = c(3.95160651959727, 5.66322715573043, 4.78428682906746, 
3.95160651959727, 8.64699826466527, 5.22375699239895, 2.21685587486771, 
6.26460737923668, 7.32858777467081, 5.10810694941697, 13.4349100441188, 
17.8296116774337, 4.43733670012155), var8 = c(-0.26816575471459, 
-2.22109405865227, -0.684925127360919, -0.610437202620312, -0.992706989333577, 
-2.22109405865227, -2.22109405865227, -1.18471391327617, -0.0453480011081498, 
-2.22109405865227, -2.22109405865227, 0.458106246115031, -0.668273210278581
), var5 = c(-2.13956432564926, -2.53280664751937, -1.98388315078687, 
-1.98901371885298, -2.1294980103762, -2.53280664751937, -1.79970373457131, 
-2.28463259624605, -1.80597162662216, -2.53280664751937, -2.53280664751937, 
-1.88967135860789, -1.44655372200994), var10 = c(1.68807994597818, 
2.18335753902296, 1.71114038431881, 1.01868449149111, 1.23601260419722, 
2.19923133723404, 1.48807907883862, 1.51345225278989, 0.851562581385538, 
1.94244266486479, 1.99509237036138, 0.825553744909574, 1.88026593557423
), var12 = c(10.7085516923292, 10.7085516923292, 7.58484953479799, 
7.58484953479799, 7.58484953479799, 7.58484953479799, 10.7085516923292, 
10.7085516923292, 7.58484953479799, 10.7085516923292, 7.58484953479799, 
10.7085516923292, 10.7085516923292), var11 = c(1.39792107350584, 
0.531589122501453, 0.0122424604371142, -0.331594575349984, 0.21151285716957, 
-0.140321572340113, 0.243592808161356, -0.0668651403382106, 0.0183053082573835, 
0.143598127729515, -0.122690138644377, 0.00103031167210884, 0.0162193243659431
), var6 = c(5.36188342506708, -0.515563106726135, 5.03519119718127, 
3.07867425065343, 7.50496027705085, -0.771442186896905, -0.594395224388743, 
3.15879730601383, 4.53699767490813, -0.467873570172355, -0.671221992183987, 
0.996382238141532, 0.238651826800801)), class = "data.frame", row.names = c(NA, 
-13L))

vif_df(df)

Error in value[[3L]](cond) : 
  the VIF computation failed. Please use cor_df() or cor_select() to check and remove perfect correlations from df before the VIF assessment.
In addition: Warning message:
In validate_df(df = df, min_rows = 30) :
  the number of rows in 'df' is lower than 30. A multicollinearity analysis may fail or yield meaningless results.

Whereas:

cor_df(df)

# A tibble: 78 × 3
# x     y     correlation
# <chr> <chr>       <dbl>
#   1 var10 var8       -0.75 
# 2 var5  var8        0.645
# 3 var6  var8        0.61 
# 4 var1  var2       -0.555
# 5 var10 var5       -0.553
# 6 var6  var13      -0.538
# 7 var1  var4       -0.533
# 8 var6  var10      -0.517
# 9 var8  var13      -0.503
# 10 var5  var3        0.477
# # ℹ 68 more rows
# # ℹ Use `print(n = ...)` to see more rows
# Warning message:
#   In validate_df(df = df, min_rows = ifelse(test = cor_method == "pearson",  :
#                                               the number of rows in 'df' is lower than 30. A multicollinearity analysis may fail or yield meaningless results.

It does not fail if you instead use df <- df[1:12, ], or if you exclude any of the variables.

Within vif_df(), the error seems to happen inside the tryCatch, at:

solve(cor.matrix, tol = tolerance)

# Error in solve.default(cor.matrix, tol = tolerance) : 
#   Lapack routine dgesv: system is exactly singular: U[13,13] = 0

This error may indeed be caused by perfect correlations, but that doesn't seem to be the cause here. For a more informative error message, I suppose you could add before the tryCatch within vif_df():

if (det(cor.matrix) == 0) stop("The variable correlation matrix is singular, so it cannot be solved.")

But ideally there would be an alternative way to compute the result in these cases?

Regards!

AMBarbosa commented 1 month ago

Clues from computing VIF with other packages: with both {usdm} and {fuzzySim}, the VIF is infinite for all variables (and the R-squared is 1), as each variable is perfectly explained by the combination of all others. What are the odds of this happening in an actual ecological dataset?

names(df)

# [1] "var3"  "var4"  "var9"  "var2"  "var13" "var7"  "var1"  "var8"  "var5" 
# [10] "var10" "var12" "var11" "var6" 

# [ variables as reordered inside vif_select() before being passed to vif_df() ]

usdm::vif(df)

#    Variables VIF
# 1       var3 Inf
# 2       var4 Inf
# 3       var9 Inf
# 4       var2 Inf
# 5      var13 Inf
# 6       var7 Inf
# 7       var1 Inf
# 8       var8 Inf
# 9       var5 Inf
# 10     var10 Inf
# 11     var12 Inf
# 12     var11 Inf
# 13      var6 Inf

fuzzySim::multicol(df)

#       Rsquared Tolerance VIF
# var3         1         0 Inf
# var4         1         0 Inf
# var9         1         0 Inf
# var2         1         0 Inf
# var13        1         0 Inf
# var7         1         0 Inf
# var1         1         0 Inf
# var8         1         0 Inf
# var5         1         0 Inf
# var10        1         0 Inf
# var12        1         0 Inf
# var11        1         0 Inf
# var6         1         0 Inf
heatmap(cor(df))

heatmap

AMBarbosa commented 1 month ago

I realize now that this particular data frame has exactly the same number of rows as predictors, in which case their R-squared is indeed 100% (thus VIF infinite). I hadn't noticed this, and what confused me was that the vif_df() error doesn't happen when I reduce the number of rows to even less:

collinear::vif_df(df[1:6, ])

#    variable          vif
# 1      var6 0.000000e+00
# 2      var7 3.904729e+14
# 3      var4 6.476413e+14
# 4      var9 1.828220e+15
# 5      var1 1.903784e+15
# 6      var5 4.072936e+15
# 7      var2 6.624570e+15
# 8     var10 1.040626e+16
# 9     var12 1.050840e+16
# 10     var8 1.156818e+16
# 11    var11 1.221745e+16
# 12    var13 1.362184e+16
# 13     var3 3.553314e+16
# Warning message:
#   In validate_df(df = df, min_rows = 30) :
#   the number of rows in 'df' is lower than 30. A multicollinearity analysis may fail or yield meaningless results.

Whereas:

fuzzySim::multicol(df[1:6, ])

#       Rsquared Tolerance VIF
# var3         1         0 Inf
# var4         1         0 Inf
# var9         1         0 Inf
# var2         1         0 Inf
# var13        1         0 Inf
# var7         1         0 Inf
# var1         1         0 Inf
# var8         1         0 Inf
# var5         1         0 Inf
# var10        1         0 Inf
# var12        1         0 Inf
# var11        1         0 Inf
# var6         1         0 Inf

vif_df() produces very large but not infinite values, and not all equal to each other, so I think this is a bug. I don't think the warning about <30 rows is enough to prevent mistakes or misuse. So, I would add, before computing cor.matrix within vif_df(), something like:

df <- df[, predictors.numeric, drop = FALSE]
if(nrow(df) <= ncol(df)) stop("No more data points than predictors, so VIF is infinite for all predictors.")
BlasBenito commented 1 month ago

Dear @AMBarbosa, Thanks for the report. That's an issue that bugged me during development, and I took a shortcut to address it, thinking that it would be a rare event in ecological data. Let me check it in detail, and I will get back at you as soon as possible. Best, Blas

BlasBenito commented 1 month ago

Most of the issues you have reported should be fixed with version 1.1.2, now in the development branch.

However, as shown below, I could not replicate the singular matrix problem described in this issue. The fact that the function solve() (called within vif_df()) is returning large integers instead of Inf like fuzzySim::multicol() and usdm::vif() might come down to the matrix decomposition method used by lm(). A vif larger than a given threshold is a valid diagnostic, no matter how large the reported vif is, so I cannot address any bugs here.

Still, your comment about the message warning of perfect correlations has been addressed, as not all failures in solve() come from such issue. Additionally, validate_df() now warns the user when the data frame dimensions might cause issues during the multicollinearity analysis.

Thanks a lot for your time testing this package!

Best,

Blas

> df <- structure(list(var3 = c(-2.47126191613724, -1.28045397739883, 
+                               -2.87943315893136, -2.39505691242115, -3.52312642864623, -2.15787376714855, 
+                               -2.8470635995551, -3.40621084907316, -1.82084801567374, -2.89613462680119, 
+                               -3.09295280686465, -1.54788702795955, 0.197716811658935), var4 = c(-1.08580714992212, 
+                                                                                                  -0.872818251287383, 0.338808598563681, -1.3592415601649, -1.13893772547255, 
+                                                                                                  -0.895329184841717, -0.736786740600295, -1.283865177828, -1.76003359460473, 
+                                                                                                  -0.941573502380344, -1.62995926634296, -1.60124171841818, -0.0777887868580415
+                               ), var9 = c(-0.499059586130419, -0.548405050601784, -0.602135355161387, 
+                                           -1.24102651506362, -0.781283220413801, -0.847883011063252, -0.744918212071736, 
+                                           -1.37533655145803, 0.0428460256878719, -1.6367346291579, -0.453181067037663, 
+                                           -0.776049296438161, -1.43481797357956), var2 = c(1.61724031174641, 
+                                                                                            1.37117010796349, 1.12537378915446, 1.59745859986818, 0.654943754764419, 
+                                                                                            0.926529580498604, 1.80409453277929, 0.682222989926673, 0.644254244524167, 
+                                                                                            2.44851929669553, 0.881815656889134, 0.719922484314521, 1.40464231098472
+                                           ), var13 = c(7.41040266538704, 13.5312583954783, 7.41040266538704, 
+                                                        7.41040266538704, 8.59618919764273, 7.41040266538704, 13.5312583954783, 
+                                                        7.41040266538704, 7.41040266538704, 7.41040266538704, 12.7635158716984, 
+                                                        7.41040266538704, 12.7635158716984), var7 = c(-0.59723293760077, 
+                                                                                                      -0.376209757038625, -2.12490054154511, -1.06663178472767, -1.99739835379414, 
+                                                                                                      -0.829909212350894, -1.08729797610347, -1.94584018113948, -1.41076728315705, 
+                                                                                                      -1.98272047148696, -1.27098858018749, -1.92633528579173, -1.53504019121613
+                                                        ), var1 = c(3.95160651959727, 5.66322715573043, 4.78428682906746, 
+                                                                    3.95160651959727, 8.64699826466527, 5.22375699239895, 2.21685587486771, 
+                                                                    6.26460737923668, 7.32858777467081, 5.10810694941697, 13.4349100441188, 
+                                                                    17.8296116774337, 4.43733670012155), var8 = c(-0.26816575471459, 
+                                                                                                                  -2.22109405865227, -0.684925127360919, -0.610437202620312, -0.992706989333577, 
+                                                                                                                  -2.22109405865227, -2.22109405865227, -1.18471391327617, -0.0453480011081498, 
+                                                                                                                  -2.22109405865227, -2.22109405865227, 0.458106246115031, -0.668273210278581
+                                                                    ), var5 = c(-2.13956432564926, -2.53280664751937, -1.98388315078687, 
+                                                                                -1.98901371885298, -2.1294980103762, -2.53280664751937, -1.79970373457131, 
+                                                                                -2.28463259624605, -1.80597162662216, -2.53280664751937, -2.53280664751937, 
+                                                                                -1.88967135860789, -1.44655372200994), var10 = c(1.68807994597818, 
+                                                                                                                                 2.18335753902296, 1.71114038431881, 1.01868449149111, 1.23601260419722, 
+                                                                                                                                 2.19923133723404, 1.48807907883862, 1.51345225278989, 0.851562581385538, 
+                                                                                                                                 1.94244266486479, 1.99509237036138, 0.825553744909574, 1.88026593557423
+                                                                                ), var12 = c(10.7085516923292, 10.7085516923292, 7.58484953479799, 
+                                                                                             7.58484953479799, 7.58484953479799, 7.58484953479799, 10.7085516923292, 
+                                                                                             10.7085516923292, 7.58484953479799, 10.7085516923292, 7.58484953479799, 
+                                                                                             10.7085516923292, 10.7085516923292), var11 = c(1.39792107350584, 
+                                                                                                                                            0.531589122501453, 0.0122424604371142, -0.331594575349984, 0.21151285716957, 
+                                                                                                                                            -0.140321572340113, 0.243592808161356, -0.0668651403382106, 0.0183053082573835, 
+                                                                                                                                            0.143598127729515, -0.122690138644377, 0.00103031167210884, 0.0162193243659431
+                                                                                             ), var6 = c(5.36188342506708, -0.515563106726135, 5.03519119718127, 
+                                                                                                         3.07867425065343, 7.50496027705085, -0.771442186896905, -0.594395224388743, 
+                                                                                                         3.15879730601383, 4.53699767490813, -0.467873570172355, -0.671221992183987, 
+                                                                                                         0.996382238141532, 0.238651826800801)), class = "data.frame", row.names = c(NA, 
+                                                                                                                                                                                     -13L))
> 
> 
> vif_df(df)
The number of rows in 'df' should be higher than 30 to ensure a successful multicollinearity analysis.
   variable              vif
1      var3     789261567120
2      var4   29827966032132
3      var9  356323676019566
4      var2  542963800777175
5     var13  631532969336265
6      var7  866389312101253
7      var1 1189716474071954
8      var8 1204104894968436
9      var5 1284861471669189
10    var10 1294584939394963
11    var12 1312969098046344
12    var11 2227985683922812
13     var6 3132938871214258