Error in the predict function

arzevedo commented 3 years ago

Hi Matheus, I have a problem with the package. Could you help me with this error?

library(rmachines)
mod_classification <- rmachines::random_machines(formula=EDpDM_isKeystone~.,
                                    train=functional_analysis,
                                    test=functional_analysis,
                                    boots_size=100, 
                                    cost=1,
                                    seed.bootstrap=2020, 
                                    automatic_tuning=FALSE,
                                    poly_scale=1, 
                                    gamma_rbf=1,
                                    gamma_lap=1,
                                    degree=2,
                                    offset=0)
#> 1: In .local(x, ...) : Variable(s) `' constant. Cannot scale data.

pred <- rmachines::predict(mod_classification,newdata=functional_analysis)
#> Error in UseMethod("predict") :  método não aplicável para 'predict' aplicado a um objeto de classe "rm_model"

pred <- rmachines::predict.rm_model(mod_classification,newdata=functional_analysis)
#> Error in .local(object, ...) : test vector does not match model !

^{Created on 2020-12-02 by the reprex package (v0.3.0)}

I tried both predict.rm_model and predict, but neither runs. Thanks in advance.

MateusMaiaDS commented 3 years ago

Hi Arthur,

For the first message (the "Cannot scale data" warning), I suggest checking whether there is any constant variable in the data, e.g., a column that has the same value for every instance.

For the other cases, make sure that both test and training data have the same structure and covariate names.

If neither of these works, could you send the header of the training and test sets?

arzevedo commented 3 years ago

I followed your first suggestion. There were two columns containing only zeros, so I filtered them out. That was enough to make predict.rm_model work.

library(rmachines)
library(tidyverse)
library(tidymodels)

mydf <- recipe(EDpDM_isKeystone ~ .,
               data = read_csv("C:/Users/arthu/OneDrive/Documentos/function_analysis/model_input.csv") %>% 
                 mutate(EDpDM_isKeystone = factor(EDpDM_isKeystone))
) %>%
  update_role(Taxon, new_role = "Id") %>%
  step_corr(all_predictors(), threshold = 0.8,- all_nominal()) %>% 
  prep() %>% juice()

df_split <- initial_split(mydf, strata = EDpDM_isKeystone, prop = .75)
df_train <- training(df_split)
df_test <- testing(df_split)

head(df_train)
#> # A tibble: 6 x 92
#>   Taxon Ecosystem x1_1_1_trichlor~ x1_and_2_methyl~ x2_4_dichlorobe~
#>   <fct> <fct>                <dbl>            <dbl>            <dbl>
#> 1 Acid~ sediment                 0                2                2
#> 2 Acti~ sediment                 0                0               12
#> 3 Aqui~ sediment                 0                1                0
#> 4 Arma~ sediment                 0                1                2
#> 5 Baln~ sediment                 2                4                3
#> 6 Cald~ sediment                 0                2                2
#> # ... with 87 more variables: acridone_alkaloid_biosynthesis <dbl>,
#> #   amino_sugar_and_nucleotide_sugar_metabolism <dbl>,
#> #   aminoacyl_t_rna_biosynthesis <dbl>, arachidonic_acid_metabolism <dbl>,
#> #   ascorbate_and_aldarate_metabolism <dbl>, atrazine_degradation <dbl>,
#> #   beta_lactam_resistance <dbl>, betalain_biosynthesis <dbl>,
#> #   biosynthesis_of_ansamycins <dbl>...

head(df_test)
#> # A tibble: 6 x 92
#>   Taxon Ecosystem x1_1_1_trichlor~ x1_and_2_methyl~ x2_4_dichlorobe~
#>   <fct> <fct>                <dbl>            <dbl>            <dbl>
#> 1 Cand~ sediment                 0                1                0
#> 2 Cand~ sediment                 0                0                2
#> 3 Cand~ sediment                 0                1                0
#> 4 Cand~ sediment                 0                1                0
#> 5 Cand~ sediment                 0                1                2
#> 6 Cand~ sediment                 0                1                3
#> # ... with 87 more variables: acridone_alkaloid_biosynthesis <dbl>,
#> #   amino_sugar_and_nucleotide_sugar_metabolism <dbl>,
#> #   aminoacyl_t_rna_biosynthesis <dbl>, arachidonic_acid_metabolism <dbl>,
#> #   ascorbate_and_aldarate_metabolism <dbl>, atrazine_degradation <dbl>,
#> #   beta_lactam_resistance <dbl>, betalain_biosynthesis <dbl>,
#> #   biosynthesis_of_ansamycins <dbl>....

mod_classification <- rmachines::random_machines(formula=EDpDM_isKeystone~.,
                                                 train=df_train,
                                                 test=df_test,
                                                 boots_size=100, 
                                                 cost=1,
                                                 seed.bootstrap=2020, 
                                                 automatic_tuning=FALSE,
                                                 poly_scale=1, 
                                                 gamma_rbf=1,
                                                 gamma_lap=1,
                                                 degree=2,
                                                 offset=0)

env_test <- mydf %>% filter(Ecosystem == "saline water")

head(env_test)
#> # A tibble: 6 x 92
#>   Taxon Ecosystem x1_1_1_trichlor~ x1_and_2_methyl~ x2_4_dichlorobe~
#>   <fct> <fct>                <dbl>            <dbl>            <dbl>
#> 1 Acti~ saline w~                0                0               12
#> 2 Aqui~ saline w~                0                1                0
#> 3 Bact~ saline w~                0                0                2
#> 4 Baln~ saline w~                2                4                3
#> 5 Cand~ saline w~                0                1                0
#> 6 Cand~ saline w~                1                0                0
#> # ... with 87 more variables: acridone_alkaloid_biosynthesis <dbl>,
#> #   amino_sugar_and_nucleotide_sugar_metabolism <dbl>,
#> #   aminoacyl_t_rna_biosynthesis <dbl>, arachidonic_acid_metabolism <dbl>,
#> #   ascorbate_and_aldarate_metabolism <dbl>, atrazine_degradation <dbl>,
#> #   beta_lactam_resistance <dbl>, betalain_biosynthesis <dbl>,
#> #   biosynthesis_of_ansamycins <dbl> .....

pred <- rmachines::predict(mod_classification,newdata=mydf)
#> Error in UseMethod("predict"): método não aplicável para 'predict' aplicado a um objeto de classe "rm_model"

pred <- rmachines::predict.rm_model(mod_classification,newdata=env_test)

env_test %>% 
  mutate(PRED = pred) %>% 
  count(EDpDM_isKeystone, PRED) %>% 
  group_by(EDpDM_isKeystone) %>% 
  mutate(freq = n/sum(n))
#> # A tibble: 4 x 4
#> # Groups:   EDpDM_isKeystone [2]
#>   EDpDM_isKeystone PRED      n  freq
#>   <fct>            <fct> <int> <dbl>
#> 1 0                0        55 0.509
#> 2 0                1        53 0.491
#> 3 1                0        10 0.238
#> 4 1                1        32 0.762

^{Created on 2020-12-02 by the reprex package (v0.3.0)}

I still get the error in predict, but I suppose that everything else is running smoothly.

MateusMaiaDS commented 3 years ago

Hi @arzevedo, can you show me the class of the object mod_classification? Do not use rmachines::predict; just load the library and call the predict function normally. If it is still not working, please remove and reinstall the package.

arzevedo commented 3 years ago

library(rmachines)
library(tidyverse)
library(tidymodels)

mydf <- recipe(EDpDM_isKeystone ~ .,
               data = read_csv("C:/Users/arthu/OneDrive/Documentos/function_analysis/model_input.csv") %>% 
                 mutate(EDpDM_isKeystone = factor(EDpDM_isKeystone))
) %>%
  update_role(Taxon, new_role = "Id") %>%
  step_corr(all_predictors(), threshold = 0.8,- all_nominal()) %>% 
  prep() %>% juice()

df_split <- initial_split(mydf, strata = EDpDM_isKeystone, prop = .75)
df_train <- training(df_split)
df_test <- testing(df_split)

mod_classification <- rmachines::random_machines(formula=EDpDM_isKeystone~.,
                                                 train=df_train,
                                                 test=df_test,
                                                 boots_size=100, 
                                                 cost=1,
                                                 seed.bootstrap=2020, 
                                                 automatic_tuning=FALSE,
                                                 poly_scale=1, 
                                                 gamma_rbf=1,
                                                 gamma_lap=1,
                                                 degree=2,
                                                 offset=0)

class(mod_classification)
#> [1] "rm_model"

env_test <- mydf %>% filter(Ecosystem == "saline water")

pred <- predict(mod_classification,newdata=env_test)
#> Error in UseMethod("predict"): método não aplicável para 'predict' aplicado a um objeto de classe "rm_model"

^{Created on 2020-12-03 by the reprex package (v0.3.0)}

It is still not working. I'll remove and reinstall the package.

MateusMaiaDS / rmachines

Error in the predict function #1