kunstler / MollierChap1PhD

0 stars 0 forks source link

Speed up stepAIC selection by moving to lasso #2

Closed kunstler closed 4 years ago

kunstler commented 4 years ago
  library(glmnet)
  #convert training data to matrix format
  s_time <- Sys.time()
  x <- model.matrix(response~  Dist_lisiere_act + Dist_route + FORMATION + STRUCTURE + Altitude +
                      poly(Altitude, 2) + Pente + poly(Pente, 2)+ 
                      exposition + poly(exposition, 2) + pH + poly(pH, 2) +
                      phosphore + poly(phosphore, 2)+
                      Azote + poly(Azote, 2)+
                      limons + argile,test_fit)[, -1]
  #perform grid search to find optimal value of lambda
  #family= binomial => logistic regression, alpha=1 => lasso
  cvfit = cv.glmnet(x, test_fit$response, family = "binomial", type.measure = "auc")
  #plot result
  var_lasso <- as.matrix(coef(cvfit, s = "lambda.1se"))
  vars <- gsub( '(2\\)).*', '\\1', row.names(var_lasso)[var_lasso>0])
  vars <- vars[! vars == "(Intercept)"]

  glm_envir1 <- glm(paste("response ~  ", paste(vars,  collapse = " + ")) ,
                    family = "binomial", data = test_fit)
  e_time <- Sys.time()
  e_time - s_time

takes 2.8 seconds compared to

 # Modele nul
  glm_nul1 <-
    glm(response ~ 1, family = "binomial", data = test_fit)

  # Modele avec variables environnementales seules
  start_time <- Sys.time()
  glm_envir1 <-
    step(
      glm_nul1,
      ~ .  + Dist_lisiere_act + Dist_route + FORMATION + STRUCTURE + Altitude +
        poly(Altitude, 2) + Pente + poly(Pente, 2)
      + exposition + poly(exposition, 2) + pH + poly(pH, 2)
      + phosphore + poly(phosphore, 2)
      + Azote + poly(Azote, 2)
      + limons + argile,
      direction = "both",
      trace = 0
    )
  end_time <- Sys.time()
  # Ajout du type de foret comme predicteur
  glm_type_F1 <- update(glm_envir1, ~ . + TYPE_FORET)
  end_time2 <- Sys.time()
  end_time - start_time
  end_time2 - start_time

which takes 13.9 secondes

But mixing lasso and glm might be weird

kunstler commented 4 years ago

AIC comparison in full glmnet ?

 library(glmnet)
 #convert training data to matrix format
 s_time <- Sys.time()
 x <- model.matrix(response~  Dist_lisiere_act + Dist_route + FORMATION + STRUCTURE + Altitude +
                     poly(Altitude, 2) + Pente + poly(Pente, 2)+ 
                     exposition + poly(exposition, 2) + pH + poly(pH, 2) +
                     phosphore + poly(phosphore, 2)+
                     Azote + poly(Azote, 2)+
                     limons + argile,test_fit)[, -1]
 #perform grid search to find optimal value of lambda
 #family= binomial => logistic regression, alpha=1 => lasso
 cvfit = cv.glmnet(x, test_fit$response, family = "binomial", type.measure = "auc")
 #plot result
 var_lasso <- as.matrix(coef(cvfit, s = "lambda.1se"))
 vars <- gsub( '(2\\)).*', '\\1', row.names(var_lasso)[var_lasso>0])
 vars <- vars[! vars == "(Intercept)"]

 x1 <- model.matrix(eval(parse(text=paste("response ~  ", paste(c(vars),  collapse = " + ")))),
                         data = test_fit)[, -1] 
 x2 <- model.matrix(eval(parse(text=paste("response ~  ", paste(c("TYPE_FORET", vars),  collapse = " + ")))),
                    test_fit)[, -1] 
 glmnet_envir1l <- glmnet(x1, test_fit$response, family = "binomial",
                        data = test_fit, lambda = 0)
 glmnet_type_F1l <- glmnet(x2, test_fit$response, family = "binomial",
                        data = test_fit, lambda = 0)

 AIC_glmnet <- function(fit){
   tLL <- fit$nulldev - deviance(fit)
   k <- fit$df
   AIC <- -tLL+2*k
   AIC   
 }
 AIC_glmnet(glmnet_envir1l)
 AIC_glmnet(glmnet_type_F1l)
 AIC_glmnet(glmnet_type_F1l) + 5 <  AIC_glmnet(glmnet_envir1l)
 e_time <- Sys.time()
 e_time - s_time

gives

>  library(glmnet)
>  #convert training data to matrix format
>  s_time <- Sys.time()
>  x <- model.matrix(response~  Dist_lisiere_act + Dist_route + FORMATION + STRUCTURE + Altitude +
+                      poly(Altitude, 2) + Pente + poly(Pente, 2)+ 
+                      exposition + poly(exposition, 2) + pH + poly(pH, 2) +
+                      phosphore + poly(phosphore, 2)+
+                      Azote + poly(Azote, 2)+
+                      limons + argile,test_fit)[, -1]
>  #perform grid search to find optimal value of lambda
>  #family= binomial => logistic regression, alpha=1 => lasso
>  cvfit = cv.glmnet(x, test_fit$response, family = "binomial", type.measure = "auc")
>  #plot result
>  var_lasso <- as.matrix(coef(cvfit, s = "lambda.1se"))
>  vars <- gsub( '(2\\)).*', '\\1', row.names(var_lasso)[var_lasso>0])
>  vars <- vars[! vars == "(Intercept)"]
> 
>  x1 <- model.matrix(eval(parse(text=paste("response ~  ", paste(c(vars),  collapse = " + ")))),
+                          data = test_fit)[, -1] 
>  x2 <- model.matrix(eval(parse(text=paste("response ~  ", paste(c("TYPE_FORET", vars),  collapse = " + ")))),
+                     test_fit)[, -1] 
>  glmnet_envir1l <- glmnet(x1, test_fit$response, family = "binomial",
+                         data = test_fit, lambda = 0)
>  glmnet_type_F1l <- glmnet(x2, test_fit$response, family = "binomial",
+                         data = test_fit, lambda = 0)
>  
>  AIC_glmnet <- function(fit){
+    tLL <- fit$nulldev - deviance(fit)
+    k <- fit$df
+    AIC <- -tLL+2*k
+    AIC   
+  }
>  AIC_glmnet(glmnet_envir1l)
[1] -308.7293
>  AIC_glmnet(glmnet_type_F1l)
[1] -324.8918
>  
>  e_time <- Sys.time()
>  e_time - s_time
Time difference of 3.296071 secs
>  AIC_glmnet(glmnet_type_F1l) + 5 <  AIC_glmnet(glmnet_envir1l)
[1] TRUE
kunstler commented 4 years ago
> AIC(glm_type_F1) - AIC(glm_envir1)
[1] -26.61796
> AIC_glmnet(glmnet_type_F1l) - AIC_glmnet(glmnet_envir1l)
[1] -16.1625

Both lasso glmnet and stepAIC glm give a better AIC for model wiith Foret Recente

kunstler commented 4 years ago

stick to glm