rsquaredacademy / olsrr

Tools for developing OLS regression models
https://olsrr.rsquaredacademy.com/
Other
102 stars 22 forks source link

Incompatibility of ols_step_both_p with dplyr #192

Closed bappa10085 closed 2 years ago

bappa10085 commented 2 years ago

I am trying to use ols_step_both_p with group_by function of dplyr package. I am using the following code

library(tidyverse)
library(olsrr)

df <- read.xlsx("assimilation.xlsx", sheet = "Sheet3")
head(df,2)  
str(df)
summary(df)

#Do linear regression for all the locations
model = df %>% 
  mutate(Location = factor(Location, levels = unique(Location))) %>% 
  group_by(Location) %>%
  do(mod = lm(y ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9 + x10 + x11 + x12, data = df, 
              na.action=na.omit)) 

#Do stepwise regression for 1st location
ols_step_both_p(model$mod[[1]], pent = 0.05, prem = 0.1)

#Do stepwise regression for all location
k <- lapply(model$mod, ols_step_both_p, pent = 0.05, prem = 0.1)

If we see the output, they are the same for both locations.

df = structure(list(Location = c("Location 1", "Location 1", "Location 1", 
"Location 1", "Location 1", "Location 1", "Location 1", "Location 1", 
"Location 1", "Location 1", "Location 1", "Location 1", "Location 1", 
"Location 1", "Location 1", "Location 1", "Location 1", "Location 1", 
"Location 1", "Location 1", "Location 2", "Location 2", "Location 2", 
"Location 2", "Location 2", "Location 2", "Location 2", "Location 2", 
"Location 2", "Location 2", "Location 2", "Location 2", "Location 2", 
"Location 2", "Location 2", "Location 2", "Location 2", "Location 2", 
"Location 2", "Location 2"), y = c(1.65954284204268, 1.58123919015159, 
2.05973017000125, 2.18912673315495, 2.01224505269543, 1.99259958467057, 
2.00078847394452, 1.36897959183673, 1.52340971947847, 0.0261531145981931, 
0.154664774742817, 0.790430042398546, 1.28059309309309, 0.974354066985646, 
1.20366598778004, 1.39269070394898, 1.40758547008547, 1.61010461852513, 
1.62170385395538, 1.62471511775133, 2.38461538461538, 2.42220884742702, 
2.41693907875186, 2.81167789541226, 2.74944214217405, 2.21060782036392, 
2.55440414507772, 2.96888447533929, 2.63223300970874, 2.91519143680527, 
2.7768625982778, 3.56561085972851, 3.11382113821138, 2.9851919008764, 
3.31187669990934, 2.82333333333333, 3.63553943789665, 2.26956784527047, 
2.92354185554548, NA), x1 = c(0, 0, 271.72, 138.49, 0, 9.78, 
59.25, 0, 132.37, 29.14, 127.09, 26.35, 36.34, 22.58, 85.59, 
0, 0, 358.06, 0, 0, 3.33, 0, 0, 0, 5.62, 3.33, 3.23, 16.58, 85.6, 
72.73, 48.72, 29.21, 16.67, 63.12, 53.33, 40, 89.46, 76.35, 16.66, 
47.0397916666667), x2 = c(13.12, 0, 145.91, 131.29, 6.56, 141.93, 
174.52, 0, 104.95, 133.98, 182.68, 121.07, 128.49, 87.64, 99.25, 
0, 0, 124.63, 0, 3.33, 19.58, 32.58, 51.58, 27.4, 12.68, 29.91, 
72.37, 9.34, 22.82, 55.76, 25.9, 11.93, 25.9, 13.33, 29.46, 29.56, 
19.68, 46.24, 19.9, 39.13625), x3 = c(223.84, 59.36, 0, 3.33, 
81.72, 6.45, 13.01, 196.98, 0, 0, 0, 0, 3.33, 19.89, 3.23, 79.25, 
118.17, 3.23, 118.28, 121.5, 87.01, 70.46, 54.29, 70.97, 51.81, 
84.69, 66.75, 104.27, 29.21, 52.54, 34.51, 103.44, 139.87, 85.16, 
101.93, 82.36, 79.25, 49.14, 91.62, 43.7104166666667), x4 = c(44.04, 
31.43, 3.33, 0, 16.35, 0, 9.68, 108.28, 22.8, 9.9, 6.67, 6.67, 
0, 10, 0, 19.57, 26.01, 9.9, 32.58, 26.13, 58.46, 12.67, 29.16, 
73.39, 65.45, 68.92, 22.93, 107.04, 97.63, 94.2, 132.53, 112.47, 
95.03, 127.42, 107.53, 107.96, 71.72, 68.38, 123.87, 47.4670833333333
), x5 = c(34.12, 35.23, 39.8, 31, 33.8, 31.6, 33.96, 34.7, 33.2, 
32, 32.3, 32.7, 30.72, 33.44, 31.2, 33.03, 33.73, 32.5, 34.57, 
32.83, 33.5, 32.6, 33.3, 33.3, 32.9, 34.9, 32.4, 34.4, 34, 34.3, 
33.6, 35.7, 33.4, 34.5, 34.2, 35.2, 33.6, 34.6, 34.5, 32.7333333333333
), x6 = c(25.99, 25.55, 22.2, 25.6, 25.36, 24.8, 25.7, 26.4, 
22.3, 23.3, 22.8, 23.1, 25.16, 25.59, 24.82, 25.24, 25.93, 25.11, 
25.54, 25.54, 24.5, 24.4, 25.7, 25.7, 26.9, 24.1, 24.4, 24.4, 
25.4, 24, 23.5, 24.3, 24.1, 23.9, 25.1, 27.2, 25, 24.4, 24.5, 
23.628125), x7 = c(26.26, 26.7, 20.7, 21.1, 25.71, 21.6, 20, 
26.48, 22.9, 21.3, 21.2, 21.5, 22.3, 21.6, 21, 27.12, 27.23, 
22.6, 27.09, 26.29, 24.2, 25.2, 21.9, 22.6, 25.3, 23.4, 22.6, 
25.6, 25, 21.3, 19, 24.7, 21.1, 23.6, 22.7, 20, 20.4, 22.8, 24.2, 
24.10625), x8 = c(21.34, 21.95, 12, 16.9, 22.47, 19, 18, 22.86, 
15.2, 18.2, 17.2, 14.7, 17.7, 19.2, 16.3, 21.88, 21.55, 14.5, 
22.09, 22.1, 16.9, 18.6, 16.9, 19.2, 17.9, 19, 18.6, 17.4, 13.6, 
10, 13.3, 13.3, 14.2, 12, 11.9, 11.5, 10, 12.8, 13.7, 16.5791666666667
), x9 = c(5.46, 5.99, 8.2875, 7.2175, 6.18, 5.5925, 6.3025, 6.565, 
8.2725, 6.4175, 7.0825, 6.915, 6.89, 6.4475, 7.15, 5.945, 6.165, 
10.925, 6.0325, 5.52, 7.7925, 7.5525, 7.4025, 7.985, 8.505, 8.06, 
7.3125, 8.775, 9.615, 9.2025, 8.9075, 8.8675, 8.315, 9.1525, 
8.965, 8.9225, 9.475, 8.87, 8.6175, 8.30661458333333), x10 = c(0, 
0.25, 8.25, 6.25, 0, 10.75, 6, 0, 5.75, 4, 6.75, 4.25, 4.5, 2.5, 
5.5, 0, 0.25, 5.5, 0, 0, 7.75, 8.5, 6.75, 8.75, 10, 5.75, 10.25, 
9.75, 15, 10.25, 9.5, 10.75, 11.5, 12.25, 13.5, 10, 14.25, 13.25, 
11.25, 6.5), x11 = c(66.96, 77.19, 454, 60.5, 40.8, 61.6, 96.5, 
48.37, 100.7, 105.8, 48.5, 70.5, 118.2, 79, 56.6, 60.4, 80.1, 
119.07, 72.92, 213.39, 65.9, 54.2, 50.6, 98.1, 106.3, 77.8, 39.7, 
79.8, 33.2, 71, 49.3, 74, 97, 55.4, 84.8, 116, 51.7, 87.6, 35.2, 
51.4166666666667), x12 = c(227.07, 183.81, 570.7, 160.3, 145.07, 
136.2, 153.9, 149.17, 333.6, 224.4, 106.5, 138.05, 185.8, 176.64, 
227.9, 151.13, 261.67, 273.67, 196.28, 513.67, 156.6, 136.8, 
79.5, 200.6, 286.4, 157.6, 86.3, 164.6, 74.2, 120.7, 111.5, 104.5, 
189.1, 150, 139.2, 206, 82.2, 140.5, 137.6, 101.529166666667)), row.names = c(NA, 
40L), class = "data.frame")

If you apply ols_step_both_p for a single location it works fine like

#Stepwise regression for single station
model <- lm(y ~ ., data = df1)
k <- ols_step_both_p(model)

# final model
k$model
df1 = structure(list(y = c(1.659542842, 1.58123919, 2.05973017, 2.189126733, 
2.012245053, 1.992599585, 2.000788474, 1.368979592, 1.523409719, 
0.026153115, 0.154664775, 0.790430042, 1.280593093, 0.974354067, 
1.203665988, 1.392690704, 1.40758547, 1.610104619, 1.621703854, 
1.624715118), x1 = c(0, 0, 271.72, 138.49, 0, 9.78, 59.25, 0, 
132.37, 29.14, 127.09, 26.35, 36.34, 22.58, 85.59, 0, 0, 358.06, 
0, 0), x2 = c(13.12, 0, 145.91, 131.29, 6.56, 141.93, 174.52, 
0, 104.95, 133.98, 182.68, 121.07, 128.49, 87.64, 99.25, 0, 0, 
124.63, 0, 3.33), x3 = c(223.84, 59.36, 0, 3.33, 81.72, 6.45, 
13.01, 196.98, 0, 0, 0, 0, 3.33, 19.89, 3.23, 79.25, 118.17, 
3.23, 118.28, 121.5), x4 = c(44.04, 31.43, 3.33, 0, 16.35, 0, 
9.68, 108.28, 22.8, 9.9, 6.67, 6.67, 0, 10, 0, 19.57, 26.01, 
9.9, 32.58, 26.13), x5 = c(34.12, 35.23, 39.8, 31, 33.8, 31.6, 
33.96, 34.7, 33.2, 32, 32.3, 32.7, 30.72, 33.44, 31.2, 33.03, 
33.73, 32.5, 34.57, 32.83), x6 = c(25.99, 25.55, 22.2, 25.6, 
25.36, 24.8, 25.7, 26.4, 22.3, 23.3, 22.8, 23.1, 25.16, 25.59, 
24.82, 25.24, 25.93, 25.11, 25.54, 25.54), x7 = c(26.26, 26.7, 
20.7, 21.1, 25.71, 21.6, 20, 26.48, 22.9, 21.3, 21.2, 21.5, 22.3, 
21.6, 21, 27.12, 27.23, 22.6, 27.09, 26.29), x8 = c(21.34, 21.95, 
12, 16.9, 22.47, 19, 18, 22.86, 15.2, 18.2, 17.2, 14.7, 17.7, 
19.2, 16.3, 21.88, 21.55, 14.5, 22.09, 22.1), x9 = c(5.46, 5.99, 
8.2875, 7.2175, 6.18, 5.5925, 6.3025, 6.565, 8.2725, 6.4175, 
7.0825, 6.915, 6.89, 6.4475, 7.15, 5.945, 6.165, 10.925, 6.0325, 
5.52), x10 = c(0, 0.25, 8.25, 6.25, 0, 10.75, 6, 0, 5.75, 4, 
6.75, 4.25, 4.5, 2.5, 5.5, 0, 0.25, 5.5, 0, 0), x11 = c(66.96, 
77.19, 454, 60.5, 40.8, 61.6, 96.5, 48.37, 100.7, 105.8, 48.5, 
70.5, 118.2, 79, 56.6, 60.4, 80.1, 119.07, 72.92, 213.39), x12 = c(227.07, 
183.81, 570.7, 160.3, 145.07, 136.2, 153.9, 149.17, 333.6, 224.4, 
106.5, 138.05, 185.8, 176.64, 227.9, 151.13, 261.67, 273.67, 
196.28, 513.67)), class = "data.frame", row.names = c(NA, -20L
))

Why is this not working for all the locations when using with dplyr?

Edit I have further dig it down and find that if I use data = . in do(mod = lm(y ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9 + x10 + x11 + x12, data = ., na.action=na.omit)), then the outputs for 2 locations are different. But ols_step_both_p(model$mod[[1]], pent = 0.05, prem = 0.1) returns me following error

Error in eval(model$call$data) : object '.' not found

If I use data = df in do(mod = lm(y ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9 + x10 + x11 + x12, data = df, na.action=na.omit)), then the error does not appear but output of model$mod[[1]] and model$mod[[2]] are same. How to remove the following error

Error in eval(model$call$data) : object '.' not found

aravindhebbali commented 2 years ago

Let me look into this and get back to you. We have had issues with eval() in a few other cases and looks like this might be related as well.

aravindhebbali commented 2 years ago

Please try with the development version of olsrr from GitHub and let me know if the error persists.

bappa10085 commented 2 years ago

Using the development version of olsrr from GitHub, it is working. Thank you very much for your help.