pbreheny / biglasso

biglasso: Extending Lasso Model Fitting to Big Data in R
http://pbreheny.github.io/biglasso/
113 stars 29 forks source link

data storage mode affects model fit -- why?? #54

Closed tabpeter closed 7 months ago

tabpeter commented 7 months ago
# TKP 
# April 2024 
# Objective: explore differences between model fits in-memory and filebacked
# remotes::install_github("YaohuiZeng/biglasso")
library(reprex)
library(biglasso)
#> Loading required package: bigmemory
#> Loading required package: Matrix
#> Loading required package: ncvreg
library(ncvreg)
# load colon data ----------------------------------------------------------------
data(colon)
X <- colon$X |> ncvreg::std()
# X <- cbind(1, X)
xtx <- apply(X, 2, crossprod)
init <- rep(0, ncol(X)) # cold starts - use more iterations (default is 1000)
y <- colon$y
resid <- drop(y - X %*% init)
X.bm <- as.big.matrix(X)

# take 1 -----------------------------
# file-backed model fit 
fit1 <- biglasso_fit(X = X.bm, y = y, lambda = 0.05, xtx = xtx, r = resid,
                     penalty = "lasso", max.iter = 10000)

# compare with `ncvreg::ncvfit()` -- runs in memory 
fit2 <- ncvfit(X = X, y = y, lambda = 0.05, xtx = xtx, r = resid,
               penalty = "lasso", max.iter = 10000)

# test coefficients 
tinytest::expect_equal(fit1$beta, fit2$beta, tolerance = 0.01)
#> ----- FAILED[data]: <-->
#>  call| tinytest::expect_equal(fit1$beta, fit2$beta, tolerance = 0.01)
#>  diff| Mean absolute difference: 0.02752312

# test residuals
tinytest::expect_equal(fit1$resid, fit2$resid, tolerance = 0.01)
#> ----- PASSED      : <-->
#>  call| tinytest::expect_equal(fit1$resid, fit2$resid, tolerance = 0.01)

str(fit1) # note: iter = 3773
#> List of 11
#>  $ beta          : Named num [1:2000] 0 0 0 0 0 0 0 0 0 0 ...
#>   ..- attr(*, "names")= chr [1:2000] "Hsa.3004" "Hsa.13491" "Hsa.13491.1" "Hsa.37254" ...
#>  $ iter          : int 3773
#>  $ resid         : num [1:62] 0.987 0.617 0.905 0.202 0.793 ...
#>  $ lambda        : num 0.05
#>  $ penalty       : chr "lasso"
#>  $ alpha         : num 1
#>  $ loss          : num 29.5
#>  $ penalty.factor: num [1:2000] 1 1 1 1 1 1 1 1 1 1 ...
#>  $ n             : num 62
#>  $ y             : num [1:62] 1 0 1 0 1 0 1 0 1 0 ...
#>  $ time          : num 0.35
#>  - attr(*, "class")= chr [1:2] "biglasso" "ncvreg"
str(fit2) # note: iter = 2
#> List of 10
#>  $ beta          : Named num [1:2000] 0 0 0 0 0 0 0 0 0 0 ...
#>   ..- attr(*, "names")= chr [1:2000] "Hsa.3004" "Hsa.13491" "Hsa.13491.1" "Hsa.37254" ...
#>  $ iter          : int 2
#>  $ resid         : num [1:62] 0.987 0.617 0.905 0.202 0.793 ...
#>  $ lambda        : num 0.05
#>  $ penalty       : chr "lasso"
#>  $ gamma         : num 3
#>  $ alpha         : num 1
#>  $ loss          : num 29.5
#>  $ penalty.factor: num [1:2000] 1 1 1 1 1 1 1 1 1 1 ...
#>  $ n             : int 62

nz1 <- which(fit1$beta != 0)
fit1$beta[nz1]
#>      Hsa.8147     Hsa.36689      Hsa.3152     Hsa.36665     Hsa.42949 
#> -0.0322178250 -0.1131494353  0.0218359134  0.0224074245 -0.0043473699 
#>     Hsa.692.2      Hsa.1272       Hsa.166     Hsa.31801      Hsa.3648 
#> -0.1202222748 -0.0109727114  0.0107157905  0.0414907591  0.0007487836 
#>     Hsa.13628      Hsa.3016      Hsa.5392   Hsa.35496.1      Hsa.1832 
#>  0.0022106709  0.0157166112  0.0360233297  0.0107837452 -0.0180740994 
#>     Hsa.12241     Hsa.44244      Hsa.2928     Hsa.41159     Hsa.33268 
#> -0.0082794639 -0.0176171036  0.0036267952  0.0105440705 -0.0482435225 
#>      Hsa.6814      Hsa.1660      Hsa.1491   Hsa.41098.1 
#>  0.0302496905  0.0469252260  0.0107054818 -0.0234533426

nz2 <- which(fit2$beta != 0)
fit2$beta[nz2]
#>      Hsa.3152     Hsa.36665     Hsa.42949     Hsa.692.2     Hsa.13628 
#>  1.722814e-06  4.248432e-08 -9.890970e-08 -1.908319e-07  9.282081e-07 
#>      Hsa.3016      Hsa.1832     Hsa.12241     Hsa.44244      Hsa.2928 
#>  5.391427e-07 -2.866869e-07 -5.207912e-07 -8.897555e-07  5.288186e-07 
#>     Hsa.41159     Hsa.33268      Hsa.6814      Hsa.1660 
#>  1.868370e-07 -2.609362e-07  2.094525e-07  2.229240e-07
# note: there are fewer nonzero betas here compared to fit1

fit1$beta[intersect(nz1, nz2)]
#>     Hsa.3152    Hsa.36665    Hsa.42949    Hsa.692.2    Hsa.13628     Hsa.3016 
#>  0.021835913  0.022407425 -0.004347370 -0.120222275  0.002210671  0.015716611 
#>     Hsa.1832    Hsa.12241    Hsa.44244     Hsa.2928    Hsa.41159    Hsa.33268 
#> -0.018074099 -0.008279464 -0.017617104  0.003626795  0.010544071 -0.048243523 
#>     Hsa.6814     Hsa.1660 
#>  0.030249691  0.046925226
fit2$beta[intersect(nz1, nz2)]
#>      Hsa.3152     Hsa.36665     Hsa.42949     Hsa.692.2     Hsa.13628 
#>  1.722814e-06  4.248432e-08 -9.890970e-08 -1.908319e-07  9.282081e-07 
#>      Hsa.3016      Hsa.1832     Hsa.12241     Hsa.44244      Hsa.2928 
#>  5.391427e-07 -2.866869e-07 -5.207912e-07 -8.897555e-07  5.288186e-07 
#>     Hsa.41159     Hsa.33268      Hsa.6814      Hsa.1660 
#>  1.868370e-07 -2.609362e-07  2.094525e-07  2.229240e-07
# note: these estimates are much smaller than fit1! 

# take 2 -----------------------------
fit1_take2 <- biglasso_fit(X.bm, y, lambda = 0.05, xtx = xtx, r = resid,
                           penalty = "lasso", max.iter = 10000)

# compare with `ncvreg::ncvfit()`
fit2_take2 <- ncvfit(X = X, y = y, lambda = 0.05, xtx = xtx, r = resid,
                     penalty = "lasso", max.iter = 10000)

# test coefficients 
tinytest::expect_equal(fit1_take2$beta, fit2_take2$beta, tolerance = 0.01)
#> ----- PASSED      : <-->
#>  call| tinytest::expect_equal(fit1_take2$beta, fit2_take2$beta, tolerance = 0.01)

# test residuals
tinytest::expect_equal(fit1_take2$resid, fit2_take2$resid, tolerance = 0.01)
#> ----- PASSED      : <-->
#>  call| tinytest::expect_equal(fit1_take2$resid, fit2_take2$resid, tolerance = 0.01)

nz1_take2 <- which(fit1_take2$beta != 0)
fit1_take2$beta[nz1_take2]
#>      Hsa.3152     Hsa.36665     Hsa.42949     Hsa.692.2     Hsa.13628 
#>  1.722814e-06  4.248432e-08 -9.890970e-08 -1.908319e-07  9.282081e-07 
#>      Hsa.3016      Hsa.1832     Hsa.12241     Hsa.44244      Hsa.2928 
#>  5.391427e-07 -2.866869e-07 -5.207912e-07 -8.897555e-07  5.288186e-07 
#>     Hsa.41159     Hsa.33268      Hsa.6814      Hsa.1660 
#>  1.868370e-07 -2.609362e-07  2.094525e-07  2.229240e-07

nz2_take2 <- which(fit2_take2$beta != 0)
fit1_take2$beta[nz2_take2]
#>      Hsa.3152     Hsa.42949     Hsa.692.2     Hsa.13628      Hsa.3016 
#>  1.722814e-06 -9.890970e-08 -1.908319e-07  9.282081e-07  5.391427e-07 
#>      Hsa.1832     Hsa.12241     Hsa.44244      Hsa.2928     Hsa.41159 
#> -2.866869e-07 -5.207912e-07 -8.897555e-07  5.288186e-07  1.868370e-07 
#>     Hsa.33268      Hsa.6814      Hsa.1660 
#> -2.609362e-07  2.094525e-07  2.229240e-07

fit1_take2$beta[intersect(nz1_take2, nz2_take2)]
#>      Hsa.3152     Hsa.42949     Hsa.692.2     Hsa.13628      Hsa.3016 
#>  1.722814e-06 -9.890970e-08 -1.908319e-07  9.282081e-07  5.391427e-07 
#>      Hsa.1832     Hsa.12241     Hsa.44244      Hsa.2928     Hsa.41159 
#> -2.866869e-07 -5.207912e-07 -8.897555e-07  5.288186e-07  1.868370e-07 
#>     Hsa.33268      Hsa.6814      Hsa.1660 
#> -2.609362e-07  2.094525e-07  2.229240e-07
fit2_take2$beta[intersect(nz1_take2, nz2_take2)]
#>      Hsa.3152     Hsa.42949     Hsa.692.2     Hsa.13628      Hsa.3016 
#>  1.638265e-06 -7.992673e-08 -1.577083e-07  9.061937e-07  4.678115e-07 
#>      Hsa.1832     Hsa.12241     Hsa.44244      Hsa.2928     Hsa.41159 
#> -2.529658e-07 -5.153089e-07 -8.802163e-07  4.484431e-07  1.429543e-07 
#>     Hsa.33268      Hsa.6814      Hsa.1660 
#> -2.589283e-07  1.462757e-07  1.735474e-07
# now all the estimated coefficients are just close to 0

# Questions: 
#   (1) why is it that in take 1, the test of residuals passes but the test of beta doesn't?
#   (2) why is it that in take 2, the beta coefficients are different? 
#       The data passed to the functions is the same. Nothing is stochastic here....
#   (3) why does running the same code produce different answers the second time I run it? (Related to issue 2)

Created on 2024-04-12 with reprex v2.1.0