Closed zali37 closed 1 week ago
Hello there !
You have indeed some overfitting with RF and some trouble with SRE. With the code you send, you try to tune the different models but you don't give the tuned options to BIOMOD_Modeling 👀
Could you try with this code:
bibronii_format <- BIOMOD_FormatingData(
resp.name = "bibronii",
resp.var = bibronii$presence,
resp.xy = bibronii[, c("lon", "lat")],
expl.var = stk_current,
filter.raster = TRUE
)
cv.k <- bm_CrossValidation(bm.format = bibronii_format,
strategy = 'kfold',
nb.rep = 2,
k = 3)
opt.d <- bm_ModelingOptions(
data.type = "binary",
models=c("SRE","RF","GAM","ANN"),
calib.lines = cv.k,
bm.format = bibronii_format,
strategy = "default")
tuned.rf <- bm_Tuning(model = 'RF',
tuning.fun = 'rf',
do.formula = TRUE,
bm.options = opt.d@options$RF.binary.randomForest.randomForest,
bm.format = bibronii_format)
tuned.gam <- bm_Tuning(model = 'GAM',
tuning.fun = 'gam',
do.formula = TRUE,
do.stepAIC = TRUE,
bm.options = opt.d@options$GAM.binary.mgcv.gam,
bm.format = bibronii_format)
tuned.ann <- bm_Tuning(model = "ANN",
tuning.fun = "avNNet",
do.formula = T,
bm.options = opt.d@options$ANN.binary.nnet.nnet,
bm.format = bibronii_format)
opt.sre <- list('_allData_allRun' = list(quant = 0.1))
bibronii_opt <- bm_ModelingOptions(
data.type = "binary",
models=c("SRE","RF", "GAM", "ANN"),
bm.format = bibronii_format,
strategy = "user.defined",
user.base = "bigboss",
user.val <- list(SRE.binary.biomod2.bm_SRE = opt.sre,
RF.binary.randomForest.randomForest = tuned.rf,
GAM.binary.mgcv.gam = tuned.gam,
ANN.binary.nnet.nnet = tuned.ann)
)
bibronii_modelo <- BIOMOD_Modeling(
bm.format = bibronii_format,
modeling.id = "solo_bibronii",
models = c("SRE", "RF", "GAM", "ANN"),
CV.strategy = "random",
CV.nb.rep = 5,
CV.perc = 0.8,
var.import = 3,
OPT.strategy = "user.defined",
OPT.user = bibronii_opt
metric.eval = c("TSS", "BIAS")
)
If the RF's overfitting persists, you will need to change the parameters manually (such as SRE). Note that you have very few absences compared to the number of presences: this can be the cause of the overfitting 😬
Hope it helps ! Hélène
Hi Helene, thank you for your answer I made the changes you proposed to the script but I got another error with the modeling options:
-=-=-=-=-=-=-=-=-=-=-= Build Modeling Options -=-=-=-=-=-=-=-=-=-=-= Error in .fun_testIfIn(TRUE, "names(user.val)", names(user.val), avail.options.list) :
names(user.val) must be 'ANN.binary.nnet.nnet', 'CTA.binary.rpart.rpart', 'FDA.binary.mda.fda', 'GAM.binary.gam.gam', 'GAM.binary.mgcv.bam', 'GAM.binary.mgcv.gam', 'GBM.binary.gbm.gbm', 'GLM.binary.stats.glm', 'MARS.binary.earth.earth', 'MAXENT.binary.MAXENT.MAXENT', 'MAXNET.binary.maxnet.maxnet', 'RF.binary.randomForest.randomForest', 'SRE.binary.biomod2.bm_SRE' or 'XGBOOST.binary.xgboost.xgboost In addition: Warning message: In .bm_ModelingOptions.check.args(data.type = data.type, models = models, : Only one GAM model can be activated. 'GAM.mgcv.gam' has been set (other available : 'GAM.gam.gam' or 'GAM.mgcv.bam')
I appreciate your help! Natalia
Hi Natalia,
I made a typing error ! It should be better with:
bibronii_opt <- bm_ModelingOptions(
data.type = "binary",
models=c("SRE","RF", "GAM", "ANN"),
bm.format = bibronii_format,
strategy = "user.defined",
user.base = "bigboss",
user.val = list(SRE.binary.biomod2.bm_SRE = opt.sre,
RF.binary.randomForest.randomForest = tuned.rf,
GAM.binary.mgcv.gam = tuned.gam,
ANN.binary.nnet.nnet = tuned.ann)
)
bibronii_modelo <- BIOMOD_Modeling(
bm.format = bibronii_format,
modeling.id = "solo_bibronii",
models = c("SRE", "RF", "GAM", "ANN"),
CV.strategy = "random",
CV.nb.rep = 5,
CV.perc = 0.8,
var.import = 3,
OPT.strategy = "user.defined",
OPT.user = bibronii_opt,
metric.eval = c("TSS", "BIAS")
)
Sorry 😄
Hélène
Ok, thank you! But now I get this other error
-=-=-=-=-=-=-=-=-=-=-= Build Modeling Options -=-=-=-=-=-=-=-=-=-=-=
> SRE options (datatype: binary , package: biomod2 , function: bm_SRE )...
> RF options (datatype: binary , package: randomForest , function: randomForest )...
> GAM options (datatype: binary , package: mgcv , function: gam )...
> ANN options (datatype: binary , package: nnet , function: nnet )...Error in { : task 1 failed - "task 1 failed - "
names(user.val) must be '_allData_allRun'"" In addition: Warning message: In .bm_ModelingOptions.check.args(data.type = data.type, models = models, : Only one GAM model can be activated. 'GAM.mgcv.gam' has been set (other available : 'GAM.gam.gam' or 'GAM.mgcv.bam')
Natalia
Hello Natalia,
I must have lost track at some point 😬 Let's go back a bit.
Can you share the output of :
print(opt.sre)
print(tuned.rf)
print(tuned.gam)
print(tuned.ann)
bm_ModelingOptions
Thanks a lot ! Hélène
Hi Helene. Thank you so much Here it is
> print(opt.sre)
$allData_allRun
$allData_allRun$quant
[1] 0.1
> print(user.rf)
$allData_allRun
$allData_allRun$ntre
[1] 1000
$allData_allRun$sampsize
[1] 130
$allData_allRun$replace
[1] TRUE
> print(tuned.gam)
$`_allData_allRun`
$`_allData_allRun`$formula
bibronii ~ 1 + clip_bio2 + clip_bio5 + clip_bio6 + clip_bio15
<environment: 0x000001ec037bacd0>
$`_allData_allRun`$family
Family: binomial
Link function: logit
$`_allData_allRun`$data
list()
$`_allData_allRun`$weights
NULL
$`_allData_allRun`$subset
NULL
$`_allData_allRun`$na.action
$`_allData_allRun`$offset
NULL
$`_allData_allRun`$method
[1] ML
Levels: GCV.Cp GACV.Cp REML P-REML ML P-ML
$`_allData_allRun`$optimizer
c("outer", "newton")
$`_allData_allRun`$control
$`_allData_allRun`$control$nthreads
[1] 1
$`_allData_allRun`$control$ncv.threads
[1] 1
$`_allData_allRun`$control$irls.reg
[1] 0
$`_allData_allRun`$control$epsilon
[1] 1e-07
$`_allData_allRun`$control$maxit
[1] 200
$`_allData_allRun`$control$trace
[1] FALSE
$`_allData_allRun`$control$mgcv.tol
[1] 1e-07
$`_allData_allRun`$control$mgcv.half
[1] 15
$`_allData_allRun`$control$rank.tol
[1] 1.490116e-08
$`_allData_allRun`$control$nlm
$`_allData_allRun`$control$nlm$ndigit
[1] 7
$`_allData_allRun`$control$nlm$gradtol
[1] 1e-06
$`_allData_allRun`$control$nlm$stepmax
[1] 2
$`_allData_allRun`$control$nlm$steptol
[1] 1e-04
$`_allData_allRun`$control$nlm$iterlim
[1] 200
$`_allData_allRun`$control$nlm$check.analyticals
[1] FALSE
$`_allData_allRun`$control$optim
$`_allData_allRun`$control$optim$factr
[1] 1e+07
$`_allData_allRun`$control$newton
$`_allData_allRun`$control$newton$conv.tol
[1] 1e-06
$`_allData_allRun`$control$newton$maxNstep
[1] 5
$`_allData_allRun`$control$newton$maxSstep
[1] 2
$`_allData_allRun`$control$newton$maxHalf
[1] 30
$`_allData_allRun`$control$newton$use.svd
[1] FALSE
$`_allData_allRun`$control$idLinksBases
[1] TRUE
$`_allData_allRun`$control$scalePenalty
[1] TRUE
$`_allData_allRun`$control$efs.lspmax
[1] 15
$`_allData_allRun`$control$efs.tol
[1] 0.1
$`_allData_allRun`$control$keepData
[1] FALSE
$`_allData_allRun`$control$scale.est
[1] "fletcher"
$`_allData_allRun`$control$edge.correct
[1] FALSE
$`_allData_allRun`$scale
[1] 0
$`_allData_allRun`$select
[1] TRUE
$`_allData_allRun`$knots
NULL
$`_allData_allRun`$sp
NULL
$`_allData_allRun`$min.sp
NULL
$`_allData_allRun`$H
NULL
$`_allData_allRun`$gamma
[1] 1
$`_allData_allRun`$fit
[1] TRUE
$`_allData_allRun`$paraPen
NULL
$`_allData_allRun`$G
NULL
$`_allData_allRun`$in.out
NULL
$`_allData_allRun`$drop.unused.levels
[1] TRUE
$`_allData_allRun`$drop.intercept
NULL
$`_allData_allRun`$nei
NULL
$`_allData_allRun`$discrete
[1] FALSE
> print(tuned.ann)
$`_allData_allRun`
$`_allData_allRun`$size
[1] 8
$`_allData_allRun`$decay
[1] 0.001
$`_allData_allRun`$bag
[1] FALSE
$`_allData_allRun`$formula
bibronii ~ 1 + clip_bio2 + I(clip_bio2^2) + I(clip_bio2^3) +
clip_bio5 + I(clip_bio5^2) + I(clip_bio5^3) + clip_bio6 +
I(clip_bio6^2) + I(clip_bio6^3) + clip_bio15 + I(clip_bio15^2) +
I(clip_bio15^3)
<environment: 0x000001ecaa291dc8>
> bm_ModelingOptions
function (data.type, models = c("ANN", "CTA", "FDA", "GAM", "GBM",
"GLM", "MARS", "MAXENT", "MAXNET", "RF", "SRE", "XGBOOST"),
strategy, user.val = NULL, user.base = "bigboss", bm.format = NULL,
calib.lines = NULL)
{
.bm_cat("Build Modeling Options")
args <- .bm_ModelingOptions.check.args(data.type = data.type,
models = models, strategy = strategy, user.val = user.val,
user.base = user.base, bm.format = bm.format, calib.lines = calib.lines)
for (argi in names(args)) {
assign(x = argi, value = args[[argi]])
}
rm(args)
bm.opt <- foreach(model = models, .combine = "c") %do% {
if (length(grep("GAM", model)) == 1) {
tab.model <- ModelsTable[which(ModelsTable$model ==
"GAM" & ModelsTable$type == data.type & ModelsTable$package ==
strsplit(model, "[.]")[[1]][2] & ModelsTable$func ==
strsplit(model, "[.]")[[1]][3]), ]
model = "GAM"
}
else {
tab.model <- ModelsTable[which(ModelsTable$model ==
model & ModelsTable$type == data.type), ]
}
if (nrow(tab.model) > 0) {
BOD.list <- foreach(ii = 1:nrow(tab.model)) %do%
{
name_model <- paste0(model, ".", data.type,
".", tab.model$package[ii], ".", tab.model$func[ii])
val.ii <- NULL
if (strategy == "user.defined") {
val.ii <- user.val[[name_model]]
}
BOD <- BIOMOD.options.dataset(mod = model,
typ = data.type, pkg = tab.model$package[ii],
fun = tab.model$func[ii], strategy = strategy,
user.val = val.ii, user.base = user.base,
tuning.fun = tab.model$train[ii], bm.format = bm.format,
calib.lines = calib.lines)
for (xx in 1:length(BOD@args.values)) {
if ("..." %in% names(BOD@args.values[[xx]])) {
BOD@args.values[[xx]][["..."]] <- NULL
}
}
return(BOD)
}
names(BOD.list) <- paste0(model, ".", data.type,
".", tab.model$package, ".", tab.model$func)
return(BOD.list)
}
}
bm.options <- new("BIOMOD.models.options", models = names(bm.opt),
options = bm.opt)
cat("\n")
.bm_cat("Done")
return(bm.options)
}
<bytecode: 0x000001ec0a1490f0>
<environment: namespace:biomod2>
Natalia
Hi Natalia,
The problem comes from opt.sre
and user.rf
where the "_" before allData_allRun
is necessary.
opt.sre <- list('_allData_allRun' = list(quant = 0.1))
user.rf <- list("_allData_allRun" = list(ntree = 1000, sampsize = 130, replace =TRUE))
Hope it gonna be fine now 😄 Hélène
Yes! Thanks a lot!
Context and question I am trying to know if I am setting my models in the right way so I calculated the mean scores, but RF gives too much perfection, which is suspicious. SRE directly gives out limits so definitively I am doing something wrong but I do not have any errors only warnings.
Code used bibronii_format <- BIOMOD_FormatingData( resp.name = "bibronii", resp.var = bibronii$presence, resp.xy = bibronii[, c("lon", "lat")], expl.var = stk_current, filter.raster = TRUE )
cv.k <- bm_CrossValidation(bm.format = bibronii_format, strategy = 'kfold', nb.rep = 2, k = 3)
opt.d <- bm_ModelingOptions(data.type = 'binary', models = "RF", strategy = 'tuned', bm.format = bibronii_format, calib.lines = cv.k)
tuned.rf <- bm_Tuning(model = 'RF', tuning.fun = 'rf', do.formula = TRUE, bm.options = opt.d@options$RF.binary.randomForest.randomForest, bm.format = bibronii_format)
opt.d2 <- bm_ModelingOptions(data.type = 'binary', models = "GAM", strategy = 'tuned', bm.format = bibronii_format, calib.lines = cv.k)
tuned.gam <- bm_Tuning(model = 'GAM', tuning.fun = 'gam', do.formula = TRUE, do.stepAIC = TRUE, bm.options = opt.d2@options$GAM.binary.mgcv.gam, bm.format = bibronii_format)
sre.090 <- bm_SRE(resp.var = bibronii$presence, expl.var = valores_current[4:7], quant = 0.1)
opt.d3 <- bm_ModelingOptions(data.type = 'binary', models = "ANN", strategy = 'tuned', bm.format = bibronii_format, calib.lines = cv.k) registerDoParallel() tuned.ann <- bm_Tuning(model = "ANN", tuning.fun = "avNNet", do.formula = T, bm.options = opt.d3@options$ANN.binary.nnet.nnet, bm.format = bibronii_format )
bibronii_opt <- bm_ModelingOptions( data.type = "binary", models=c("SRE","RF","GAM","ANN"), calib.lines = cv.k, bm.format = bibronii_format, strategy = "user.defined", user.base = "bigboss", user.val <- list(sre.090, tuned.rf, tuned.gam, tuned.ann )
)
bibronii_modelo <- BIOMOD_Modeling( bm.format = bibronii_format, modeling.id = "solo_bibronii", models = c("SRE", "RF", "GAM", "ANN"), CV.strategy = "random", CV.nb.rep = 5, CV.perc = 0.8, var.import = 3, metric.eval = c("TSS", "BIAS") )
bm_PlotEvalMean( bm.out = bibronii_modelo, dataset = "calibration", xlim = c(0,1), ylim = c(0,1))
These are the scores values results and the warnings
name mean1 mean2 sd1 sd2 1 ANN 0.9383333 0.7333333 0.054720502 0.12700525 2 GAM 0.9983333 0.6001667 0.004082483 0.08983411 3 RF 1.0000000 1.0000000 0.000000000 0.00000000 4 SRE -Inf -Inf NaN NaN
$plot
Warning messages: 1: Removed 1 row containing missing values or values outside the scale range (
geom_errorbarh()
). 2: Removed 1 row containing missing values or values outside the scale range (geom_errorbarh()
).-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= BIOMOD.formated.data -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
dir.name = .
sp.name = bibronii
clip_bio2 clip_bio5 clip_bio6 clip_bio15
Min. : 6.264 Min. :12.80 Min. :-7.868 Min. : 9.823
1st Qu.:10.121 1st Qu.:19.79 1st Qu.:-2.724 1st Qu.:23.825
Median :10.866 Median :21.89 Median :-1.202 Median :27.796
Mean :10.559 Mean :21.24 Mean :-1.429 Mean :28.420
3rd Qu.:11.209 3rd Qu.:23.13 3rd Qu.: 0.417 3rd Qu.:31.370
Max. :13.662 Max. :25.89 Max. : 2.588 Max. :59.618
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
show(myExpl) # if using an environment raster class : RasterStack dimensions : 335, 291, 97485, 4 (nrow, ncol, ncell, nlayers) resolution : 0.04166667, 0.04166667 (x, y) extent : -75.70833, -63.58333, -55.95833, -42 (xmin, xmax, ymin, ymax) crs : +proj=longlat +datum=WGS84 +no_defs names : clip_bio2, clip_bio5, clip_bio6, clip_bio15 min values : 3.751515, 0.800000, -14.472000, 6.956398 max values : 14.04067, 29.30000, 3.70000, 67.16410
show(myBiomodModelOut) -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= BIOMOD.models.out -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
Modeling folder : .
Species modeled : bibronii
Modeling id : solo_bibronii
Considered variables : clip_bio2 clip_bio5 clip_bio6 clip_bio15
Computed Models : bibronii_allData_RUN1_SRE bibronii_allData_RUN1_RF bibronii_allData_RUN1_GAM bibronii_allData_RUN2_SRE bibronii_allData_RUN2_RF bibronii_allData_RUN2_GAM bibronii_allData_RUN3_SRE bibronii_allData_RUN3_RF bibronii_allData_RUN3_GAM bibronii_allData_RUN3_ANN bibronii_allData_RUN4_SRE bibronii_allData_RUN4_RF bibronii_allData_RUN4_GAM bibronii_allData_RUN4_ANN bibronii_allData_RUN5_SRE bibronii_allData_RUN5_RF bibronii_allData_RUN5_GAM bibronii_allData_allRun_SRE bibronii_allData_allRun_RF bibronii_allData_allRun_GAM bibronii_allData_allRun_ANN
Failed Models : bibronii_allData_RUN1_ANN bibronii_allData_RUN2_ANN bibronii_allData_RUN5_ANN
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
Other outputs (optional) If you have questions about a figure or maps, you can add a screenshot here.
Environment Information Please paste the output of
sessionInfo()
in your current R session below. R version 4.3.3 (2024-02-29 ucrt) Platform: x86_64-w64-mingw32/x64 (64-bit) Running under: Windows 10 x64 (build 19045)Matrix products: default
locale: [1] LC_COLLATE=English_United States.utf8 LC_CTYPE=English_United States.utf8
[3] LC_MONETARY=English_United States.utf8 LC_NUMERIC=C
[5] LC_TIME=English_United States.utf8
time zone: America/Buenos_Aires tzcode source: internal
attached base packages: [1] parallel splines stats graphics grDevices utils datasets methods
[9] base
other attached packages: [1] caret_6.0-94 ggplot2_3.5.1 snowfall_1.84-6.3
[4] snow_0.4-4 biomod2_4.2-5-2 xgboost_1.7.8.1
[7] randomForest_4.7-1.2 maxnet_0.1.4 earth_5.3.3
[10] plotmo_3.6.4 plotrix_3.8-4 Formula_1.2-5
[13] gbm_2.2.2 lattice_0.22-5 doParallel_1.0.17
[16] iterators_1.0.14 ggtext_0.1.2 tidyterra_0.6.1
[19] mgcv_1.9-1 nlme_3.1-166 gam_1.22-5
[22] foreach_1.5.2 raster_3.6-26 sp_2.1-4
[25] mda_0.5-4 class_7.3-22 rpart_4.1.23
[28] nnet_7.3-19 terra_1.7-78
loaded via a namespace (and not attached): [1] DBI_1.2.2 deldir_2.0-4 pROC_1.18.5
[4] rlang_1.1.3 magrittr_2.0.3 e1071_1.7-13
[7] compiler_4.3.3 png_0.1-8 vctrs_0.6.5
[10] reshape2_1.4.4 stringr_1.5.1 pkgconfig_2.0.3
[13] labeling_0.4.3 utf8_1.2.4 prodlim_2024.06.25
[16] purrr_1.0.2 jsonlite_1.8.8 PresenceAbsence_1.1.11 [19] recipes_1.1.0 reshape_0.8.9 jpeg_0.1-10
[22] R6_2.5.1 stringi_1.8.4 RColorBrewer_1.1-3
[25] parallelly_1.38.0 lubridate_1.9.3 Rcpp_1.0.12
[28] future.apply_1.11.2 zoo_1.8-12 Matrix_1.6-1.1
[31] timechange_0.3.0 tidyselect_1.2.1 rstudioapi_0.16.0
[34] abind_1.4-5 timeDate_4032.109 codetools_0.2-19
[37] listenv_0.9.1 tibble_3.2.1 plyr_1.8.9
[40] withr_3.0.1 future_1.34.0 survival_3.6-4
[43] sf_1.0-16 units_0.8-5 proxy_0.4-27
[46] xml2_1.3.6 pillar_1.9.0 KernSmooth_2.23-22
[49] stats4_4.3.3 generics_0.1.3 munsell_0.5.1
[52] scales_1.3.0 globals_0.16.3 glue_1.7.0
[55] tools_4.3.3 interp_1.1-6 hexbin_1.28.3
[58] data.table_1.15.4 ModelMetrics_1.2.2.2 gower_1.0.1
[61] grid_4.3.3 tidyr_1.3.1 latticeExtra_0.6-30
[64] ipred_0.9-15 colorspace_2.1-1 cli_3.6.2
[67] fansi_1.0.6 viridisLite_0.4.2 lava_1.8.0
[70] rasterVis_0.51.6 dplyr_1.1.4 gtable_0.3.5
[73] digest_0.6.35 classInt_0.4-10 farver_2.1.2
[76] lifecycle_1.0.4 dismo_1.3-14 hardhat_1.4.0
[79] gridtext_0.1.5 MASS_7.3-60.0.1