Open avinashbarnwal opened 4 years ago
@avinashbarnwal Can you also post the error log?
@hcho3, please find the error -
Error in .const_array(dim, nd, lp): length(x) == prod(dim[subdim]) is not TRUE
Traceback:
1. predict(bm, newdata = trn.data[1:10, ], type = "density", q = q)
2. predict.stmboost(bm, newdata = trn.data[1:10, ], type = "density",
. q = q)
3. cbind(ret, predict(tmpm, newdata = nd, ...))
4. predict(tmpm, newdata = nd, ...)
5. predict.ctm(tmpm, newdata = nd, ...)
6. dmlt(object = object, newdata = newdata, q = q, log = FALSE,
. ...)
7. tmlt(object, newdata = newdata, q = q, ...)
8. predict(object$model, newdata = newdata, coef = coef(object),
. dim = dim, ...)
9. predict.cbind_bases(object$model, newdata = newdata, coef = coef(object),
. dim = dim, ...)
10. predict(object[[b]], newdata = newdata, coef = cf, dim = dim,
. terms = tm, ...)
11. predict.cbind_bases(object[[b]], newdata = newdata, coef = cf,
. dim = dim, terms = tm, ...)
12. predict(object[[b]], newdata = newdata, coef = cf, dim = dim,
. terms = tm, ...)
13. predict.basis(object[[b]], newdata = newdata, coef = cf, dim = dim,
. terms = tm, ...)
14. .const_array(dim, nd, lp)
15. stopifnot(length(x) == prod(dim[subdim]))
What is the content of length(x)
and dim[subdim]
?
These are internal calls. I am not sure about the internals.
Try adding some printing statements in the lines in the error trace.
It is throwing an error when we are calling -
d = predict(bm, newdata = trn.data[1:10,], type = "density", q = q)
IMO the example is still not minimal enough. does the error message happen on all data sets or just this one?
example(stmboost) works for me, > example(stmboost)
stmbst> if (require("TH.data") && require("tram")) {
stmbst+ data("bodyfat", package = "TH.data")
stmbst+
stmbst+ ### estimate unconditional model
stmbst+ m_mlt <- BoxCox(DEXfat ~ 1, data = bodyfat, prob = c(.1, .99))
stmbst+ ### get corresponding in-sample log-likelihood
stmbst+ logLik(m_mlt)
stmbst+
stmbst+ ### estimate conditional transformation model
stmbst+ bm <- stmboost(m_mlt, formula = DEXfat ~ ., data = bodyfat,
stmbst+ method = quote(mboost::mboost))
stmbst+ ### in-sample log-likelihood (NEEDS TUNING OF mstop!)
stmbst+ logLik(bm)
stmbst+
stmbst+ ### evaluate conditional densities for two observations
stmbst+ predict(bm, newdata = bodyfat[1:2,], type = "density")
stmbst+ }
[,1] [,2]
1 6.920333e-25 5.359008e-27
2 1.150292e-22 1.122044e-24
3 1.482642e-20 1.821723e-22
4 1.481872e-18 2.293515e-20
5 1.148503e-16 2.239068e-18
6 6.902403e-15 1.695040e-16
7 3.216734e-13 9.950368e-15
8 1.162456e-11 4.529447e-13
9 2.467597e-10 1.196683e-11
10 3.116104e-09 1.838282e-10
11 2.651180e-08 1.867701e-09
12 1.666780e-07 1.381862e-08
13 8.308695e-07 8.015650e-08
14 3.462558e-06 3.854442e-07
15 1.254178e-05 1.601263e-06
16 4.059005e-05 5.919748e-06
17 1.195920e-04 1.987468e-05
18 3.245622e-04 6.138832e-05
19 8.165382e-04 1.757247e-04
20 1.908895e-03 4.675884e-04
21 4.145105e-03 1.156558e-03
22 8.341802e-03 2.653598e-03
23 1.550743e-02 5.629056e-03
24 2.653558e-02 1.099845e-02
25 4.165873e-02 1.972202e-02
26 5.985350e-02 3.236010e-02
27 7.859573e-02 4.849378e-02
28 9.433554e-02 6.633626e-02
29 1.036698e-01 8.291834e-02
30 1.046601e-01 9.495987e-02
31 9.754643e-02 1.000633e-01
32 8.447584e-02 9.757978e-02
33 6.849797e-02 8.868782e-02
34 5.245685e-02 7.573666e-02
35 3.829501e-02 6.130869e-02
36 2.690697e-02 4.748127e-02
37 1.837055e-02 3.551024e-02
38 1.230042e-02 2.588060e-02
39 8.147312e-03 1.854207e-02
40 5.380786e-03 1.316540e-02
41 3.568720e-03 9.333962e-03
42 2.392047e-03 6.653603e-03
43 1.629333e-03 4.798819e-03
44 1.132763e-03 3.520730e-03
45 8.058217e-04 2.637289e-03
46 5.860539e-04 2.018193e-03
47 4.330897e-04 1.570796e-03
48 3.209706e-04 1.229623e-03
49 2.336194e-04 9.501151e-04
50 1.481507e-04 6.423238e-04
>
this is more minimal
library(tram)
library(tbm)
data("GBSG2", package = "TH.data")
f <- survival::Surv(time, cens) ~ horTh
m_mlt <- Survreg(f, data = GBSG2, dist = "lognormal")
bm <- stmboost(m_mlt, formula = f, data = GBSG2,control = boost_control(mstop=200,nu=0.01,trace=TRUE),method = quote(mboost::mboost))
q = seq(from = 0, to = max(GBSG2$time), length.out = 100)
sessionInfo()
d = predict(bm, newdata = GBSG2, type = "density", q = q)
th798@cmp2986 MINGW64 ~/projects/aft-xgboost
$ R --vanilla < mboost-smaller.R
R version 3.6.1 (2019-07-05) -- "Action of the Toes"
Copyright (C) 2019 The R Foundation for Statistical Computing
Platform: x86_64-w64-mingw32/x64 (64-bit)
R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.
Natural language support but running in an English locale
R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.
Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.
> library(tram)
Loading required package: mlt
Loading required package: basefun
Loading required package: variables
> library(tbm)
Loading required package: mboost
Loading required package: parallel
Loading required package: stabs
This is mboost 2.9-1. See 'package?mboost' and 'news(package = "mboost")'
for a complete list of changes.
> data("GBSG2", package = "TH.data")
> f <- survival::Surv(time, cens) ~ horTh
> m_mlt <- Survreg(f, data = GBSG2, dist = "lognormal")
> bm <- stmboost(m_mlt, formula = f, data = GBSG2,control = boost_control(mstop=200,nu=0.01,trace=TRUE),method = quote(mboost::mboost))
Error in X %*% beta : non-conformable arguments
In addition: Warning message:
In c.basis(bresponse = response, bshifting = shifting) :
more than one basis contains an intercept term
[ 1] ...................................... -- risk: 2614.115
[ 41] ...................................... -- risk: 2614.115
[ 81] ...................................... -- risk: 2614.115
[ 121] ...................................... -- risk: 2614.115
[ 161] ......................................
Final risk: 2614.115
Warning message:
In baselearner(bl) :
cannot compute 'bbs' for non-numeric variables; used 'bols' instead.
> q = seq(from = 0, to = max(GBSG2$time), length.out = 100)
> sessionInfo()
R version 3.6.1 (2019-07-05)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 17134)
Matrix products: default
locale:
[1] LC_COLLATE=English_United States.1252
[2] LC_CTYPE=English_United States.1252
[3] LC_MONETARY=English_United States.1252
[4] LC_NUMERIC=C
[5] LC_TIME=English_United States.1252
attached base packages:
[1] parallel stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] tbm_0.3-0 mboost_2.9-1 stabs_0.6-3 tram_0.3-0
[5] mlt_1.1-0 basefun_1.0-6 variables_1.0-2
loaded via a namespace (and not attached):
[1] Rcpp_1.0.2 Formula_1.2-3 BB_2019.10-1
[4] nnls_1.4 splines_3.6.1 MASS_7.3-51.4
[7] orthopolynom_1.0-5 lattice_0.20-38 quadprog_1.5-7
[10] multcomp_1.4-10 alabama_2015.3-1 tools_3.6.1
[13] grid_3.6.1 TH.data_1.0-10 survival_2.44-1.1
[16] inum_1.0-1 libcoin_1.0-4 numDeriv_2016.8-1.1
[19] Matrix_1.2-17 nloptr_1.2.1 codetools_0.2-16
[22] rpart_4.1-15 sandwich_2.5-1 compiler_3.6.1
[25] coneproj_1.14 partykit_1.2-5 polynom_1.4-0
[28] mvtnorm_1.0-11 zoo_1.8-6
> d = predict(bm, newdata = GBSG2, type = "density", q = q)
Error in .const_array(dim, nd, lp) :
length(x) == prod(dim[subdim]) is not TRUE
Calls: predict ... predict -> predict.basis -> .const_array -> stopifnot
Execution halted
]0;MINGW64:/c/Users/th798/projects/aft-xgboost
th798@cmp2986 MINGW64 ~/projects/aft-xgboost
$
here is another MRE with simulated data.
library(tram)
library(tbm)
N <- 100
set.seed(1)
x <- runif(N)
y <- 10+30*x+rnorm(N)
y.lower <- y-runif(N)
y.upper <- y+runif(N)
plot(y ~ x)
segments(x, y.lower, x, y.upper)
df <- data.frame(x, y.lower, y.upper)
f <- survival::Surv(y.lower, y.upper, type="interval2") ~ x
m_mlt <- Survreg(f, data = df, dist = "lognormal")
bm <- stmboost(m_mlt, formula = f, data = df,control = boost_control(mstop=200,nu=0.01,trace=TRUE),method = quote(mboost::mboost))
q = seq(from = min(df$y.lower), to = max(df$y.upper), length.out = 100)
sessionInfo()
d = predict(bm, newdata = df, type = "density", q = q)
output:
th798@cmp2986 MINGW64 ~/projects/aft-xgboost
$ R --vanilla < mboost-smaller.R
R version 3.6.1 (2019-07-05) -- "Action of the Toes"
Copyright (C) 2019 The R Foundation for Statistical Computing
Platform: x86_64-w64-mingw32/x64 (64-bit)
R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.
Natural language support but running in an English locale
R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.
Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.
> library(tram)
Loading required package: mlt
Loading required package: basefun
Loading required package: variables
> library(tbm)
Loading required package: mboost
Loading required package: parallel
Loading required package: stabs
This is mboost 2.9-1. See 'package?mboost' and 'news(package = "mboost")'
for a complete list of changes.
> N <- 100
> set.seed(1)
> x <- runif(N)
> y <- 10+30*x+rnorm(N)
> y.lower <- y-runif(N)
> y.upper <- y+runif(N)
> plot(y ~ x)
> segments(x, y.lower, x, y.upper)
>
> df <- data.frame(x, y.lower, y.upper)
> f <- survival::Surv(y.lower, y.upper, type="interval2") ~ x
> m_mlt <- Survreg(f, data = df, dist = "lognormal")
> bm <- stmboost(m_mlt, formula = f, data = df,control = boost_control(mstop=200,nu=0.01,trace=TRUE),method = quote(mboost::mboost))
Error in X %*% beta : non-conformable arguments
In addition: Warning message:
In c.basis(bresponse = response, bshifting = shifting) :
more than one basis contains an intercept term
[ 1] ...................................... -- risk: 218.0055
[ 41] ...................................... -- risk: 221.4632
[ 81] ...................................... -- risk: 228.7793
[ 121] ...................................... -- risk: 241.061
[ 161] ......................................
Final risk: 261.0727
There were 20 warnings (use warnings() to see them)
> q = seq(from = min(df$y.lower), to = max(df$y.upper), length.out = 100)
> sessionInfo()
R version 3.6.1 (2019-07-05)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 17134)
Matrix products: default
locale:
[1] LC_COLLATE=English_United States.1252
[2] LC_CTYPE=English_United States.1252
[3] LC_MONETARY=English_United States.1252
[4] LC_NUMERIC=C
[5] LC_TIME=English_United States.1252
attached base packages:
[1] parallel stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] tbm_0.3-0 mboost_2.9-1 stabs_0.6-3 tram_0.3-0
[5] mlt_1.1-0 basefun_1.0-6 variables_1.0-2
loaded via a namespace (and not attached):
[1] Rcpp_1.0.2 Formula_1.2-3 BB_2019.10-1
[4] nnls_1.4 splines_3.6.1 MASS_7.3-51.4
[7] orthopolynom_1.0-5 lattice_0.20-38 quadprog_1.5-7
[10] multcomp_1.4-10 alabama_2015.3-1 grid_3.6.1
[13] TH.data_1.0-10 survival_2.44-1.1 inum_1.0-1
[16] libcoin_1.0-4 numDeriv_2016.8-1.1 Matrix_1.2-17
[19] nloptr_1.2.1 codetools_0.2-16 rpart_4.1-15
[22] sandwich_2.5-1 compiler_3.6.1 coneproj_1.14
[25] partykit_1.2-5 polynom_1.4-0 mvtnorm_1.0-11
[28] zoo_1.8-6
> d = predict(bm, newdata = df, type = "density", q = q)
Error in .const_array(dim, nd, lp) :
length(x) == prod(dim[subdim]) is not TRUE
Calls: predict ... predict -> predict.basis -> .const_array -> stopifnot
Execution halted
]0;MINGW64:/c/Users/th798/projects/aft-xgboost
th798@cmp2986 MINGW64 ~/projects/aft-xgboost
$
Thanks @tdhock.
Torsten's new code works for me @avinashbarnwal
library(tram)
library(tbm)
N <- 100
set.seed(1)
x <- runif(N, min = -.5, max = .5)
### I made this a log-normal DGP
y <- exp(1+2*x+rnorm(N))
### smaller than 0 is not allowed
y.lower <- pmax(y-runif(N), 0.001)
y.upper <- y+runif(N)
plot(y ~ x)
segments(x, y.lower, x, y.upper)
df <- data.frame(x, y.lower, y.upper)
### saver: generate a new variable with a simple name
### because the deparsed call might be a trouble maker
df$y <- survival::Surv(y.lower, y.upper, type="interval2")
### the model is UNCONDITIONAL (no x here)
### (you can have conditional models, but this is more complex)
m_mlt <- Survreg(y ~ 1, data = df, dist = "lognormal")
### but boosting needs the x
f <- y ~ x
bm <- stmboost(m_mlt, formula = f, data = df,control = boost_control(mstop=200,nu=0.01,trace=TRUE),method = quote(mboost::mboost))
q = seq(from = min(df$y.lower), to = max(df$y.upper), length.out = 10)
sessionInfo()
### this is dim(grid) x dim(df), so obs are in columns !!!
dim(d <- predict(bm, newdata = df, type = "density", q = q))
Above is the contents of toby.R script from Torsten. Below is the output of running that in R:
th798@cmp2986 MINGW64 ~/projects/neuroblastoma-data (master)
$ R -e "install.packages(c('tram', 'tbm'))"
R version 3.6.1 (2019-07-05) -- "Action of the Toes"
Copyright (C) 2019 The R Foundation for Statistical Computing
Platform: x86_64-w64-mingw32/x64 (64-bit)
R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.
Natural language support but running in an English locale
R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.
Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.
> install.packages(c('tram', 'tbm'))
Installing packages into 'C:/Users/th798/R/win-library/3.6'
(as 'lib' is unspecified)
trying URL 'http://cloud.r-project.org/bin/windows/contrib/3.6/tram_0.3-0.zip'
Content type 'application/zip' length 958381 bytes (935 KB)
==================================================
downloaded 935 KB
trying URL 'http://cloud.r-project.org/bin/windows/contrib/3.6/tbm_0.3-1.zip'
Content type 'application/zip' length 4087877 bytes (3.9 MB)
==================================================
downloaded 3.9 MB
package 'tram' successfully unpacked and MD5 sums checked
package 'tbm' successfully unpacked and MD5 sums checked
The downloaded binary packages are in
C:\Users\th798\AppData\Local\Temp\RtmpiGTnSc\downloaded_packages
>
>
]0;MINGW64:/c/Users/th798/projects/neuroblastoma-data
th798@cmp2986 MINGW64 ~/projects/neuroblastoma-data (master)
$ R --vanilla < ~/Downloads/toby.R
R version 3.6.1 (2019-07-05) -- "Action of the Toes"
Copyright (C) 2019 The R Foundation for Statistical Computing
Platform: x86_64-w64-mingw32/x64 (64-bit)
R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.
Natural language support but running in an English locale
R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.
Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.
>
> library(tram)
Loading required package: mlt
Loading required package: basefun
Loading required package: variables
> library(tbm)
Loading required package: mboost
Loading required package: parallel
Loading required package: stabs
This is mboost 2.9-1. See 'package?mboost' and 'news(package = "mboost")'
for a complete list of changes.
> N <- 100
> set.seed(1)
> x <- runif(N, min = -.5, max = .5)
> ### I made this a log-normal DGP
> y <- exp(1+2*x+rnorm(N))
> ### smaller than 0 is not allowed
> y.lower <- pmax(y-runif(N), 0.001)
> y.upper <- y+runif(N)
> plot(y ~ x)
> segments(x, y.lower, x, y.upper)
>
> df <- data.frame(x, y.lower, y.upper)
> ### saver: generate a new variable with a simple name
> ### because the deparsed call might be a trouble maker
> df$y <- survival::Surv(y.lower, y.upper, type="interval2")
> ### the model is UNCONDITIONAL (no x here)
> ### (you can have conditional models, but this is more complex)
> m_mlt <- Survreg(y ~ 1, data = df, dist = "lognormal")
> ### but boosting needs the x
> f <- y ~ x
> bm <- stmboost(m_mlt, formula = f, data = df,control = boost_control(mstop=200,nu=0.01,trace=TRUE),method = quote(mboost::mboost))
[ 1] ...................................... -- risk: 263.5698
[ 41] ...................................... -- risk: 258.8081
[ 81] ...................................... -- risk: 256.2437
[ 121] ...................................... -- risk: 254.8281
[ 161] ......................................
Final risk: 254.0211
Warning message:
In c.basis(bresponse = response, bshifting = shifting) :
more than one basis contains an intercept term
> q = seq(from = min(df$y.lower), to = max(df$y.upper), length.out = 10)
> sessionInfo()
R version 3.6.1 (2019-07-05)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 17134)
Matrix products: default
locale:
[1] LC_COLLATE=English_United States.1252
[2] LC_CTYPE=English_United States.1252
[3] LC_MONETARY=English_United States.1252
[4] LC_NUMERIC=C
[5] LC_TIME=English_United States.1252
attached base packages:
[1] parallel stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] tbm_0.3-1 mboost_2.9-1 stabs_0.6-3 tram_0.3-0
[5] mlt_1.1-0 basefun_1.0-6 variables_1.0-2
loaded via a namespace (and not attached):
[1] Rcpp_1.0.2 Formula_1.2-3 BB_2019.10-1
[4] nnls_1.4 splines_3.6.1 MASS_7.3-51.4
[7] orthopolynom_1.0-5 lattice_0.20-38 quadprog_1.5-7
[10] multcomp_1.4-10 alabama_2015.3-1 grid_3.6.1
[13] TH.data_1.0-10 survival_2.44-1.1 inum_1.0-1
[16] libcoin_1.0-4 numDeriv_2016.8-1.1 Matrix_1.2-17
[19] nloptr_1.2.1 codetools_0.2-16 rpart_4.1-15
[22] sandwich_2.5-1 compiler_3.6.1 coneproj_1.14
[25] partykit_1.2-5 polynom_1.4-0 mvtnorm_1.0-11
[28] zoo_1.8-6
> ### this is dim(grid) x dim(df), so obs are in columns !!!
> dim(d <- predict(bm, newdata = df, type = "density", q = q))
[1] 10 100
>
>
>
>
]0;MINGW64:/c/Users/th798/projects/neuroblastoma-data
th798@cmp2986 MINGW64 ~/projects/neuroblastoma-data (master)
$
Hi Prof. @tdhock,
I am not sure why we have different y\~1 and y\~x.
m_mlt <- Survreg(y ~ 1, data = df, dist = "lognormal") f <- y ~ x
I generally keep the same formula for both the places. Check the snippets of the code -
my.surv = Surv(y.lower,y.upper,type='interval2')
formula = as.formula(paste("my.surv ~", paste(colnames(X),collapse="+")))
m_mlt = Survreg(formula, data = trn.data, dist = "lognormal")
bm = stmboost(m_mlt, formula = formula, data = trn.data,control = boost_control(mstop=200,nu=0.001,trace=TRUE),method = quote(mboost::mboost))
I am feeling we should make software, where we can reproduce these things more clearly. Things are very hay-wire.
Hi Prof. @tdhock,
I have updated the minimal reproducible example code and I have changed.
m_mlt <- Survreg(y ~ 1, data = df, dist = "lognormal")
to
m_mlt <- Survreg(y ~ x, data = df, dist = "lognormal")
Then it gave the same error as before.
Error in .const_array(dim, nd, lp): length(x) == prod(dim[subdim]) is not TRUE
Traceback:
1. predict(bm, newdata = df, type = "density", q = q)
2. predict.stmboost(bm, newdata = df, type = "density", q = q)
3. cbind(ret, predict(tmpm, newdata = nd, ...))
4. predict(tmpm, newdata = nd, ...)
5. predict.ctm(tmpm, newdata = nd, ...)
6. dmlt(object = object, newdata = newdata, q = q, log = FALSE,
. ...)
7. tmlt(object, newdata = newdata, q = q, ...)
8. predict(object$model, newdata = newdata, coef = coef(object),
. dim = dim, ...)
9. predict.cbind_bases(object$model, newdata = newdata, coef = coef(object),
. dim = dim, ...)
10. predict(object[[b]], newdata = newdata, coef = cf, dim = dim,
. terms = tm, ...)
11. predict.cbind_bases(object[[b]], newdata = newdata, coef = cf,
. dim = dim, terms = tm, ...)
12. predict(object[[b]], newdata = newdata, coef = cf, dim = dim,
. terms = tm, ...)
13. predict.basis(object[[b]], newdata = newdata, coef = cf, dim = dim,
. terms = tm, ...)
14. .const_array(dim, nd, lp)
15. stopifnot(length(x) == prod(dim[subdim]))
Also for
m_mlt <- Survreg(y ~ 1, data = df, dist = "lognormal")
d is null.
Check this notebook - https://github.com/avinashbarnwal/aftXgboostPaper/blob/master/src/R/testing/testing_mboost/mboost_MRE1.ipynb
I am not sure whats the difference between y\~x and y\~1 in Survreg.
Hi Prof. @tdhock,
Please let me know if you have got the time to look into this issue.
I think the two formulas are for different purposes. In Torsten's email he indicated that y ~ 1 should be used in one case and not the other. So please do that in your code.
Thanks for the update Prof. @tdhock. I will use it accordingly. But i am confused what the difference? Do you think we should ask Prof. Torsten and also predicted values from y~1 are NAs?
I don't know what is the difference, you should ask Torsten for clarification. but I don't think it is relevant to what you are doing. I think you should just copy/modify the code he sent us, and that should be fine for baseline predictions.
Yes, but even using y~1 predictions are nulls.
I don't understand / can't replicate the nulls you describe. When I do Survreg(y ~ 1)
and stmboost(formula = y ~ inputs)
I get real-valued predictions. For example this R code
library(tram)
library(tbm)
N <- 100
set.seed(1)
x <- runif(N, min = -.5, max = .5)
### I made this a log-normal DGP
y <- exp(1+2*x+rnorm(N))
### smaller than 0 is not allowed
y.lower <- pmax(y-runif(N), 0.001)
y.upper <- y+runif(N)
plot(y ~ x)
segments(x, y.lower, x, y.upper)
df <- data.frame(x, y.lower, y.upper)
### saver: generate a new variable with a simple name
### because the deparsed call might be a trouble maker
df$y <- survival::Surv(y.lower, y.upper, type="interval2")
m_mlt <- Survreg(
y ~ 1, #DO NOT CHANGE THIS.
data = df,
dist = "lognormal")
bm <- stmboost(
m_mlt,
formula = y ~ x, #CHANGE x to all input variables.
data = df,
control = boost_control(mstop=200,nu=0.01,trace=TRUE),
method = quote(mboost::mboost))
pred.mat <- predict(
bm,
newdata = df,
type = "density",
q = 0.5)#help(predict.mlt) q: quantiles at which to evaluate the model
pred.mat[, 1:5]
sessionInfo()
I get the following result:
th798@cmp2986 MINGW64 ~/projects/aft-xgboost
$ R --vanilla < mboost-pred.R
R version 3.6.1 (2019-07-05) -- "Action of the Toes"
Copyright (C) 2019 The R Foundation for Statistical Computing
Platform: x86_64-w64-mingw32/x64 (64-bit)
R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.
Natural language support but running in an English locale
R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.
Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.
> library(tram)
Loading required package: mlt
Loading required package: basefun
Loading required package: variables
> library(tbm)
Loading required package: mboost
Loading required package: parallel
Loading required package: stabs
This is mboost 2.9-1. See 'package?mboost' and 'news(package = "mboost")'
for a complete list of changes.
> N <- 100
> set.seed(1)
> x <- runif(N, min = -.5, max = .5)
> ### I made this a log-normal DGP
> y <- exp(1+2*x+rnorm(N))
> ### smaller than 0 is not allowed
> y.lower <- pmax(y-runif(N), 0.001)
> y.upper <- y+runif(N)
> plot(y ~ x)
> segments(x, y.lower, x, y.upper)
>
> df <- data.frame(x, y.lower, y.upper)
> ### saver: generate a new variable with a simple name
> ### because the deparsed call might be a trouble maker
> df$y <- survival::Surv(y.lower, y.upper, type="interval2")
> m_mlt <- Survreg(
+ y ~ 1, #DO NOT CHANGE THIS.
+ data = df,
+ dist = "lognormal")
> bm <- stmboost(
+ m_mlt,
+ formula = y ~ x, #CHANGE x to all input variables.
+ data = df,
+ control = boost_control(mstop=200,nu=0.01,trace=TRUE),
+ method = quote(mboost::mboost))
[ 1] ...................................... -- risk: 263.5698
[ 41] ...................................... -- risk: 258.8081
[ 81] ...................................... -- risk: 256.2437
[ 121] ...................................... -- risk: 254.8281
[ 161] ......................................
Final risk: 254.0211
Warning message:
In c.basis(bresponse = response, bshifting = shifting) :
more than one basis contains an intercept term
> pred.mat <- predict(
+ bm,
+ newdata = df,
+ type = "density",
+ q = 0.5)#help(predict.mlt) q: quantiles at which to evaluate the model
> pred.mat[, 1:5]
[1] 0.40957453 0.28031297 0.14472058 0.03636722 0.48420437
> sessionInfo()
R version 3.6.1 (2019-07-05)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 17134)
Matrix products: default
locale:
[1] LC_COLLATE=English_United States.1252
[2] LC_CTYPE=English_United States.1252
[3] LC_MONETARY=English_United States.1252
[4] LC_NUMERIC=C
[5] LC_TIME=English_United States.1252
attached base packages:
[1] parallel stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] tbm_0.3-1 mboost_2.9-1 stabs_0.6-3 tram_0.3-0
[5] mlt_1.1-0 basefun_1.0-6 variables_1.0-2
loaded via a namespace (and not attached):
[1] Rcpp_1.0.2 Formula_1.2-3 BB_2019.10-1
[4] nnls_1.4 splines_3.6.1 MASS_7.3-51.4
[7] orthopolynom_1.0-5 lattice_0.20-38 quadprog_1.5-7
[10] multcomp_1.4-10 alabama_2015.3-1 grid_3.6.1
[13] TH.data_1.0-10 survival_2.44-1.1 inum_1.0-1
[16] libcoin_1.0-4 numDeriv_2016.8-1.1 Matrix_1.2-17
[19] nloptr_1.2.1 codetools_0.2-16 rpart_4.1-15
[22] sandwich_2.5-1 compiler_3.6.1 coneproj_1.14
[25] partykit_1.2-5 polynom_1.4-0 mvtnorm_1.0-11
[28] zoo_1.8-6
>
]0;MINGW64:/c/Users/th798/projects/aft-xgboost
th798@cmp2986 MINGW64 ~/projects/aft-xgboost
$
the important part with the real-valued predictions is
> pred.mat[, 1:5]
[1] 0.40957453 0.28031297 0.14472058 0.03636722 0.48420437
Hi Prof. @tdhock,
I am still finding null values. Below is the code -
library(tram)
library(tbm)
N <- 100
set.seed(1)
x <- runif(N, min = -.5, max = .5)
### I made this a log-normal DGP
y <- exp(1+2*x+rnorm(N))
### smaller than 0 is not allowed
y.lower <- pmax(y-runif(N), 0.001)
y.upper <- y+runif(N)
plot(y ~ x)
segments(x, y.lower, x, y.upper)
df <- data.frame(x, y.lower, y.upper)
### saver: generate a new variable with a simple name
### because the deparsed call might be a trouble maker
df$y <- survival::Surv(y.lower, y.upper, type="interval2")
m_mlt <- Survreg(
y ~ 1, #DO NOT CHANGE THIS.
data = df,
dist = "lognormal")
bm <- stmboost(
m_mlt,
formula = y ~ x, #CHANGE x to all input variables.
data = df,
control = boost_control(mstop=200,nu=0.01,trace=TRUE),
method = quote(mboost::mboost))
pred.mat <- predict(
bm,
newdata = df,
type = "density",
q = 0.5)#help(predict.mlt) q: quantiles at which to evaluate the model
pred.mat[, 1:5]
[ 1] ...................................... -- risk: 263.5698
[ 41] ...................................... -- risk: 258.8081
[ 81] ...................................... -- risk: 256.2437
[ 121] ...................................... -- risk: 254.8281
[ 161] ......................................
Final risk: 254.0211
Warning message in c.basis(bresponse = response, bshifting = shifting):
“more than one basis contains an intercept term”
<NA>
<NA>
<NA>
<NA>
<NA>
sessionInfo()
R version 3.6.1 (2019-07-05)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS Sierra 10.12.6
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] parallel stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] tbm_0.3-1 mboost_2.9-1 stabs_0.6-3 tram_0.3-1
[5] mlt_1.0-6 basefun_1.0-5 variables_1.0-2
loaded via a namespace (and not attached):
[1] Rcpp_1.0.2 pillar_1.4.2 compiler_3.6.1
[4] base64enc_0.1-3 tools_3.6.1 rpart_4.1-15
[7] partykit_1.2-5 digest_0.6.20 uuid_0.1-2
[10] jsonlite_1.6 evaluate_0.14 lattice_0.20-38
[13] rlang_0.4.0 Matrix_1.2-17 IRdisplay_0.7.0
[16] IRkernel_1.0.2.9000 polynom_1.4-0 mvtnorm_1.0-11
[19] libcoin_1.0-5 repr_1.0.1 BB_2014.10-1
[22] grid_3.6.1 survival_2.44-1.1 orthopolynom_1.0-5
[25] multcomp_1.4-10 pbdZMQ_0.3-3 TH.data_1.0-10
[28] alabama_2015.3-1 Formula_1.2-3 nnls_1.4
[31] codetools_0.2-16 htmltools_0.3.6 splines_3.6.1
[34] MASS_7.3-51.4 numDeriv_2016.8-1.1 quadprog_1.5-7
[37] sandwich_2.5-1 coneproj_1.14 inum_1.0-1
[40] crayon_1.3.4 zoo_1.8-6
I am not sure how you are getting different results?
Hi Prof. @tdhock ,
We are running on a different OS. I am using Mac and you are using Windows. Also, my version for the tram is latest - tram_0.3-1. You are using tram_0.3-0. Please let me know if you think these are the reasons for difference in results.
Hi Prof. @tdhock,
I have been able to reproduce the minimal reproducible example in Linux. This is AWS EC2 provided by @hcho3. Link - https://github.com/avinashbarnwal/aftXgboostPaper/blob/master/src/R/production/mboost/mboost.ipynb
Hi Prof. @tdhock and @hcho3,
Please find the minimal reproducible example to test the predict function of mboost -