Closed tabpeter closed 7 months ago
# TKP # April 2024 # Objective: explore differences between model fits in-memory and filebacked # remotes::install_github("YaohuiZeng/biglasso") library(reprex) library(biglasso) #> Loading required package: bigmemory #> Loading required package: Matrix #> Loading required package: ncvreg library(ncvreg) # load colon data ---------------------------------------------------------------- data(colon) X <- colon$X |> ncvreg::std() # X <- cbind(1, X) xtx <- apply(X, 2, crossprod) init <- rep(0, ncol(X)) # cold starts - use more iterations (default is 1000) y <- colon$y resid <- drop(y - X %*% init) X.bm <- as.big.matrix(X) # take 1 ----------------------------- # file-backed model fit fit1 <- biglasso_fit(X = X.bm, y = y, lambda = 0.05, xtx = xtx, r = resid, penalty = "lasso", max.iter = 10000) # compare with `ncvreg::ncvfit()` -- runs in memory fit2 <- ncvfit(X = X, y = y, lambda = 0.05, xtx = xtx, r = resid, penalty = "lasso", max.iter = 10000) # test coefficients tinytest::expect_equal(fit1$beta, fit2$beta, tolerance = 0.01) #> ----- FAILED[data]: <--> #> call| tinytest::expect_equal(fit1$beta, fit2$beta, tolerance = 0.01) #> diff| Mean absolute difference: 0.02752312 # test residuals tinytest::expect_equal(fit1$resid, fit2$resid, tolerance = 0.01) #> ----- PASSED : <--> #> call| tinytest::expect_equal(fit1$resid, fit2$resid, tolerance = 0.01) str(fit1) # note: iter = 3773 #> List of 11 #> $ beta : Named num [1:2000] 0 0 0 0 0 0 0 0 0 0 ... #> ..- attr(*, "names")= chr [1:2000] "Hsa.3004" "Hsa.13491" "Hsa.13491.1" "Hsa.37254" ... #> $ iter : int 3773 #> $ resid : num [1:62] 0.987 0.617 0.905 0.202 0.793 ... #> $ lambda : num 0.05 #> $ penalty : chr "lasso" #> $ alpha : num 1 #> $ loss : num 29.5 #> $ penalty.factor: num [1:2000] 1 1 1 1 1 1 1 1 1 1 ... #> $ n : num 62 #> $ y : num [1:62] 1 0 1 0 1 0 1 0 1 0 ... #> $ time : num 0.35 #> - attr(*, "class")= chr [1:2] "biglasso" "ncvreg" str(fit2) # note: iter = 2 #> List of 10 #> $ beta : Named num [1:2000] 0 0 0 0 0 0 0 0 0 0 ... #> ..- attr(*, "names")= chr [1:2000] "Hsa.3004" "Hsa.13491" "Hsa.13491.1" "Hsa.37254" ... #> $ iter : int 2 #> $ resid : num [1:62] 0.987 0.617 0.905 0.202 0.793 ... #> $ lambda : num 0.05 #> $ penalty : chr "lasso" #> $ gamma : num 3 #> $ alpha : num 1 #> $ loss : num 29.5 #> $ penalty.factor: num [1:2000] 1 1 1 1 1 1 1 1 1 1 ... #> $ n : int 62 nz1 <- which(fit1$beta != 0) fit1$beta[nz1] #> Hsa.8147 Hsa.36689 Hsa.3152 Hsa.36665 Hsa.42949 #> -0.0322178250 -0.1131494353 0.0218359134 0.0224074245 -0.0043473699 #> Hsa.692.2 Hsa.1272 Hsa.166 Hsa.31801 Hsa.3648 #> -0.1202222748 -0.0109727114 0.0107157905 0.0414907591 0.0007487836 #> Hsa.13628 Hsa.3016 Hsa.5392 Hsa.35496.1 Hsa.1832 #> 0.0022106709 0.0157166112 0.0360233297 0.0107837452 -0.0180740994 #> Hsa.12241 Hsa.44244 Hsa.2928 Hsa.41159 Hsa.33268 #> -0.0082794639 -0.0176171036 0.0036267952 0.0105440705 -0.0482435225 #> Hsa.6814 Hsa.1660 Hsa.1491 Hsa.41098.1 #> 0.0302496905 0.0469252260 0.0107054818 -0.0234533426 nz2 <- which(fit2$beta != 0) fit2$beta[nz2] #> Hsa.3152 Hsa.36665 Hsa.42949 Hsa.692.2 Hsa.13628 #> 1.722814e-06 4.248432e-08 -9.890970e-08 -1.908319e-07 9.282081e-07 #> Hsa.3016 Hsa.1832 Hsa.12241 Hsa.44244 Hsa.2928 #> 5.391427e-07 -2.866869e-07 -5.207912e-07 -8.897555e-07 5.288186e-07 #> Hsa.41159 Hsa.33268 Hsa.6814 Hsa.1660 #> 1.868370e-07 -2.609362e-07 2.094525e-07 2.229240e-07 # note: there are fewer nonzero betas here compared to fit1 fit1$beta[intersect(nz1, nz2)] #> Hsa.3152 Hsa.36665 Hsa.42949 Hsa.692.2 Hsa.13628 Hsa.3016 #> 0.021835913 0.022407425 -0.004347370 -0.120222275 0.002210671 0.015716611 #> Hsa.1832 Hsa.12241 Hsa.44244 Hsa.2928 Hsa.41159 Hsa.33268 #> -0.018074099 -0.008279464 -0.017617104 0.003626795 0.010544071 -0.048243523 #> Hsa.6814 Hsa.1660 #> 0.030249691 0.046925226 fit2$beta[intersect(nz1, nz2)] #> Hsa.3152 Hsa.36665 Hsa.42949 Hsa.692.2 Hsa.13628 #> 1.722814e-06 4.248432e-08 -9.890970e-08 -1.908319e-07 9.282081e-07 #> Hsa.3016 Hsa.1832 Hsa.12241 Hsa.44244 Hsa.2928 #> 5.391427e-07 -2.866869e-07 -5.207912e-07 -8.897555e-07 5.288186e-07 #> Hsa.41159 Hsa.33268 Hsa.6814 Hsa.1660 #> 1.868370e-07 -2.609362e-07 2.094525e-07 2.229240e-07 # note: these estimates are much smaller than fit1! # take 2 ----------------------------- fit1_take2 <- biglasso_fit(X.bm, y, lambda = 0.05, xtx = xtx, r = resid, penalty = "lasso", max.iter = 10000) # compare with `ncvreg::ncvfit()` fit2_take2 <- ncvfit(X = X, y = y, lambda = 0.05, xtx = xtx, r = resid, penalty = "lasso", max.iter = 10000) # test coefficients tinytest::expect_equal(fit1_take2$beta, fit2_take2$beta, tolerance = 0.01) #> ----- PASSED : <--> #> call| tinytest::expect_equal(fit1_take2$beta, fit2_take2$beta, tolerance = 0.01) # test residuals tinytest::expect_equal(fit1_take2$resid, fit2_take2$resid, tolerance = 0.01) #> ----- PASSED : <--> #> call| tinytest::expect_equal(fit1_take2$resid, fit2_take2$resid, tolerance = 0.01) nz1_take2 <- which(fit1_take2$beta != 0) fit1_take2$beta[nz1_take2] #> Hsa.3152 Hsa.36665 Hsa.42949 Hsa.692.2 Hsa.13628 #> 1.722814e-06 4.248432e-08 -9.890970e-08 -1.908319e-07 9.282081e-07 #> Hsa.3016 Hsa.1832 Hsa.12241 Hsa.44244 Hsa.2928 #> 5.391427e-07 -2.866869e-07 -5.207912e-07 -8.897555e-07 5.288186e-07 #> Hsa.41159 Hsa.33268 Hsa.6814 Hsa.1660 #> 1.868370e-07 -2.609362e-07 2.094525e-07 2.229240e-07 nz2_take2 <- which(fit2_take2$beta != 0) fit1_take2$beta[nz2_take2] #> Hsa.3152 Hsa.42949 Hsa.692.2 Hsa.13628 Hsa.3016 #> 1.722814e-06 -9.890970e-08 -1.908319e-07 9.282081e-07 5.391427e-07 #> Hsa.1832 Hsa.12241 Hsa.44244 Hsa.2928 Hsa.41159 #> -2.866869e-07 -5.207912e-07 -8.897555e-07 5.288186e-07 1.868370e-07 #> Hsa.33268 Hsa.6814 Hsa.1660 #> -2.609362e-07 2.094525e-07 2.229240e-07 fit1_take2$beta[intersect(nz1_take2, nz2_take2)] #> Hsa.3152 Hsa.42949 Hsa.692.2 Hsa.13628 Hsa.3016 #> 1.722814e-06 -9.890970e-08 -1.908319e-07 9.282081e-07 5.391427e-07 #> Hsa.1832 Hsa.12241 Hsa.44244 Hsa.2928 Hsa.41159 #> -2.866869e-07 -5.207912e-07 -8.897555e-07 5.288186e-07 1.868370e-07 #> Hsa.33268 Hsa.6814 Hsa.1660 #> -2.609362e-07 2.094525e-07 2.229240e-07 fit2_take2$beta[intersect(nz1_take2, nz2_take2)] #> Hsa.3152 Hsa.42949 Hsa.692.2 Hsa.13628 Hsa.3016 #> 1.638265e-06 -7.992673e-08 -1.577083e-07 9.061937e-07 4.678115e-07 #> Hsa.1832 Hsa.12241 Hsa.44244 Hsa.2928 Hsa.41159 #> -2.529658e-07 -5.153089e-07 -8.802163e-07 4.484431e-07 1.429543e-07 #> Hsa.33268 Hsa.6814 Hsa.1660 #> -2.589283e-07 1.462757e-07 1.735474e-07 # now all the estimated coefficients are just close to 0 # Questions: # (1) why is it that in take 1, the test of residuals passes but the test of beta doesn't? # (2) why is it that in take 2, the beta coefficients are different? # The data passed to the functions is the same. Nothing is stochastic here.... # (3) why does running the same code produce different answers the second time I run it? (Related to issue 2)
Created on 2024-04-12 with reprex v2.1.0
Created on 2024-04-12 with reprex v2.1.0