Closed cgoo4 closed 2 years ago
Hello @cgoo4 👋
I'm not able to reproduce this using the most recent version of tidyclust. Can you update with `remotes::install_github("emilhvitfeldt/tidyclust")` and try again?
library(tidymodels)
library(tidyclust)
#>
#> Attaching package: 'tidyclust'
#> The following object is masked from 'package:parsnip':
#>
#> prepare_data
# Toy data set for the reprex: 33 rows and two numeric columns, entered
# row-wise. The first row (-10.73, 7.31) lies far from all other points.
data <- tribble(
~var1, ~var2,
-10.7309753827869, 7.31314782728295,
0.158313453988934, -1.61716134057181,
2.56976828347436, -0.0304718731434479,
2.52510547176991, 2.25837680812956,
-0.545994998160519, -1.65459969854165,
4.24718480285034, 2.63196351710467,
-2.58277023154306, 0.071240121563463,
1.76791313026014, -0.0820188617066647,
0.700250002891148, -0.28659873913792,
2.05609909308338, -0.944805582445498,
0.341405837432006, -1.32977660565642,
-2.50062436356793, -1.90326151166205,
-1.0388232093446, -0.36389143468345,
-1.00255850689283, -1.6047655508424,
1.63335997713595, 1.3094039086307,
2.8965669433778, 2.53970798590398,
1.9755753941663, 1.05180385078969,
0.949450225797159, 0.0715029785385012,
-3.11293609366517, -1.20094733897835,
0.433470080004349, 1.44883471157841,
1.80155162083184, 1.74231103529767,
-2.0330697607949, -2.238240115786,
0.226699387711105, -1.56773486777401,
0.705202066774586, 1.41309135840208,
-2.0229531809923, -3.80825276177795,
1.54212334904791, -0.33715090285275,
2.98458548773627, 2.88958633008886,
-0.551832844262862, -2.05139067509532,
2.82225740379044, 2.21534428308637,
-2.74329702499752, -3.54358226626749,
-0.468043631321413, -1.56406035159058,
0.466410395768423, -0.926812424657721,
-3.46941317956229, 0.0992081867745776
)
# Model specification: k-means on the "stats" engine using the Lloyd
# algorithm, with the number of clusters left open for tuning.
kmeans_spec <- set_engine(
  k_means(num_clusters = tune()),
  "stats",
  algorithm = "Lloyd"
)

# Preprocessing: both columns are predictors; the formula has no outcome.
kmeans_rec <- recipe(~ var1 + var2, data = data)
kmeans_wflow <- workflow(preprocessor = kmeans_rec, spec = kmeans_spec)

# Five-fold resamples and a regular grid of candidate cluster counts.
kmeans_cv <- vfold_cv(data, v = 5)
kmeans_grid <- grid_regular(num_clusters(), levels = 6)

# Tune over the grid, keeping predictions and the fitted objects.
kmeans_res <- tune_cluster(
  object = kmeans_wflow,
  resamples = kmeans_cv,
  grid = kmeans_grid,
  control = control_grid(save_pred = TRUE, extract = identity),
  metrics = cluster_metric_set(tot_wss, tot_sse, sse_ratio)
)

# List any notes recorded while tuning.
collect_notes(kmeans_res)
#> # A tibble: 0 × 4
#> # … with 4 variables: id <chr>, location <chr>, type <chr>, note <chr>
Created on 2022-08-30 by the reprex package (v2.0.1)
Hi @EmilHvitfeldt - I've updated again with `remotes::install_github("emilhvitfeldt/tidyclust")` and re-run, but I get the same outcome (session info included):
library(tidyverse)
library(tidymodels)
library(tidyclust)
# Toy data set for the reprex: 33 rows and two numeric columns, entered
# row-wise. The first row (-10.73, 7.31) lies far from all other points.
data <- tribble(
~var1, ~var2,
-10.7309753827869, 7.31314782728295,
0.158313453988934, -1.61716134057181,
2.56976828347436, -0.0304718731434479,
2.52510547176991, 2.25837680812956,
-0.545994998160519, -1.65459969854165,
4.24718480285034, 2.63196351710467,
-2.58277023154306, 0.071240121563463,
1.76791313026014, -0.0820188617066647,
0.700250002891148, -0.28659873913792,
2.05609909308338, -0.944805582445498,
0.341405837432006, -1.32977660565642,
-2.50062436356793, -1.90326151166205,
-1.0388232093446, -0.36389143468345,
-1.00255850689283, -1.6047655508424,
1.63335997713595, 1.3094039086307,
2.8965669433778, 2.53970798590398,
1.9755753941663, 1.05180385078969,
0.949450225797159, 0.0715029785385012,
-3.11293609366517, -1.20094733897835,
0.433470080004349, 1.44883471157841,
1.80155162083184, 1.74231103529767,
-2.0330697607949, -2.238240115786,
0.226699387711105, -1.56773486777401,
0.705202066774586, 1.41309135840208,
-2.0229531809923, -3.80825276177795,
1.54212334904791, -0.33715090285275,
2.98458548773627, 2.88958633008886,
-0.551832844262862, -2.05139067509532,
2.82225740379044, 2.21534428308637,
-2.74329702499752, -3.54358226626749,
-0.468043631321413, -1.56406035159058,
0.466410395768423, -0.926812424657721,
-3.46941317956229, 0.0992081867745776
)
# Model specification: k-means on the "stats" engine using the Lloyd
# algorithm, with the number of clusters left open for tuning.
kmeans_spec <- set_engine(
  k_means(num_clusters = tune()),
  "stats",
  algorithm = "Lloyd"
)

# Preprocessing: both columns are predictors; the formula has no outcome.
kmeans_rec <- recipe(~ var1 + var2, data = data)
kmeans_wflow <- workflow(preprocessor = kmeans_rec, spec = kmeans_spec)

# Five-fold resamples and a regular grid of candidate cluster counts.
kmeans_cv <- vfold_cv(data, v = 5)
kmeans_grid <- grid_regular(num_clusters(), levels = 6)

# Tune over the grid, keeping predictions and the fitted objects.
kmeans_res <- tune_cluster(
  object = kmeans_wflow,
  resamples = kmeans_cv,
  grid = kmeans_grid,
  control = control_grid(save_pred = TRUE, extract = identity),
  metrics = cluster_metric_set(tot_wss, tot_sse, sse_ratio)
)
#> Warning: All models failed. See the `.notes` column.

# List any notes recorded while tuning.
collect_notes(kmeans_res)
#> # A tibble: 0 × 2
#> # … with 2 variables: id <chr>, .notes <???>
Created on 2022-08-31 with reprex v2.0.2
Hi @EmilHvitfeldt, I've run into the same issue. I originally encountered the error working with my own [proprietary] data. As part of my troubleshooting, I created a fresh R project, installed the most recent version of tidyclust, copied the tuning and metrics vignette verbatim, and the error still appeared.
I hope this is helpful for additional troubleshooting (e.g. if some supporting package version discrepancy is at play here). Thanks.
Also, ditto the praise for tidyclust. Great package and great talk at conf! 🏆
library(parsnip)
library(workflows)
library(tidyclust)
#>
#> Attaching package: 'tidyclust'
#> The following object is masked from 'package:parsnip':
#>
#> prepare_data
library(tidyverse)
library(tidymodels)
# Load the penguins data and drop rows containing missing values.
data("penguins", package = "modeldata")
penguins <- drop_na(penguins)

# Five-fold cross-validation resamples.
penguins_cv <- vfold_cv(penguins, v = 5)

# k-means specification with the number of clusters marked for tuning.
kmeans_spec <- k_means(num_clusters = tune())

# Cluster on the two bill measurements only.
penguins_rec <- recipe(
  ~ bill_length_mm + bill_depth_mm,
  data = penguins
)
kmeans_wflow <- workflow(preprocessor = penguins_rec, spec = kmeans_spec)

# Regular grid with ten candidate values for num_clusters.
clust_num_grid <- grid_regular(num_clusters(), levels = 10)
clust_num_grid
#> # A tibble: 10 × 1
#> num_clusters
#> <int>
#> 1 1
#> 2 2
#> 3 3
#> 4 4
#> 5 5
#> 6 6
#> 7 7
#> 8 8
#> 9 9
#> 10 10
# Tune the workflow over the grid on the resamples, keeping predictions
# and the fitted objects.
res <- tune_cluster(
  object = kmeans_wflow,
  resamples = penguins_cv,
  grid = clust_num_grid,
  control = control_grid(save_pred = TRUE, extract = identity),
  metrics = cluster_metric_set(tot_wss, tot_sse, sse_ratio)
)
#> Warning: All models failed. See the `.notes` column.

# Print the tuning results.
res
#> # Tuning results
#> # 5-fold cross-validation
#> # A tibble: 5 × 6
#> splits id .metrics .notes .extracts .predictions
#> <list> <chr> <list> <list> <list> <list>
#> 1 <split [266/67]> Fold1 <NULL> <NULL> <NULL> <NULL>
#> 2 <split [266/67]> Fold2 <NULL> <NULL> <NULL> <NULL>
#> 3 <split [266/67]> Fold3 <NULL> <NULL> <NULL> <NULL>
#> 4 <split [267/66]> Fold4 <NULL> <NULL> <NULL> <NULL>
#> 5 <split [267/66]> Fold5 <NULL> <NULL> <NULL> <NULL>
# Collect the metrics from the tuning results; this errors in this run
# (output below).
res_metrics <- collect_metrics(res)
#> Error in `estimate_tune_results()`:
#> ! All of the models failed. See the .notes column.
res_metrics
#> Error in eval(expr, envir, enclos): object 'res_metrics' not found
Created on 2022-08-31 by the reprex package (v2.0.1)
Hello @cgoo4 & @stevensmallberg! Can one of you run the following code exactly and report back? I'm trying to narrow down the issue
library(tidyclust)
library(recipes)
library(workflows)
# Load the penguins data and drop rows containing missing values.
data("penguins", package = "modeldata")
penguins <- tidyr::drop_na(penguins)

# k-means with a fixed number of clusters (no tuning involved).
kmeans_spec <- k_means(num_clusters = 3)

# Cluster on the two bill measurements only.
penguins_rec <- recipe(
  ~ bill_length_mm + bill_depth_mm,
  data = penguins
)
kmeans_wflow <- workflow(preprocessor = penguins_rec, spec = kmeans_spec)

# Fit the workflow directly to check that a plain fit succeeds.
fit(kmeans_wflow, data = penguins)
@EmilHvitfeldt Works just fine when the `num_clusters` argument of `k_means()` is supplied explicitly.
I should've said originally — the workflow laid out in the kmeans vignette worked for me, including on my own data, without any errors.
library(tidyclust)
library(recipes)
#> Loading required package: dplyr
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
#>
#> Attaching package: 'recipes'
#> The following object is masked from 'package:stats':
#>
#> step
library(workflows)
# Load the penguins data and drop rows containing missing values.
data("penguins", package = "modeldata")
penguins <- tidyr::drop_na(penguins)

# k-means with a fixed number of clusters (no tuning involved).
kmeans_spec <- k_means(num_clusters = 3)

# Cluster on the two bill measurements only.
penguins_rec <- recipe(
  ~ bill_length_mm + bill_depth_mm,
  data = penguins
)
kmeans_wflow <- workflow(preprocessor = penguins_rec, spec = kmeans_spec)

# Fit the workflow directly; the printed result follows.
fit(kmeans_wflow, data = penguins)
#> ══ Workflow [trained] ══════════════════════════════════════════════════════════
#> Preprocessor: Recipe
#> Model: k_means()
#>
#> ── Preprocessor ────────────────────────────────────────────────────────────────
#> 0 Recipe Steps
#>
#> ── Model ───────────────────────────────────────────────────────────────────────
#> K-means clustering with 3 clusters of sizes 112, 136, 85
#>
#> Cluster means:
#> bill_length_mm bill_depth_mm
#> 1 45.50982 15.68304
#> 2 38.42426 18.27794
#> 3 50.90353 17.33647
#>
#> Clustering vector:
#> [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
#> [38] 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 1 2 2 2 2
#> [75] 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 1 2 1 2 2 2 1 2
#> [112] 2 2 2 2 2 2 2 2 2 2 2 2 1 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 3
#> [149] 1 3 1 1 1 1 1 1 1 3 1 1 1 3 1 3 1 3 3 1 1 1 1 1 1 1 3 1 1 1 3 3 3 1 1 1 3
#> [186] 1 3 1 3 3 1 1 3 1 1 1 1 1 3 1 1 1 1 1 3 1 1 1 3 1 3 3 1 3 1 1 1 1 1 3 1 3
#> [223] 1 1 3 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 3 1 1 3 1 3 1 3 1 1 3 1 1 3 3 1 3 1 3
#> [260] 3 1 1 3 1 3 1 3 3 1 3 1 1 3 1 3 1 3 1 3 1 3 3 3 1 3 1 3 1 3 1 3 3 3 1 3 2
#> [297] 3 1 3 3 1 1 3 1 3 3 1 3 1 3 3 3 3 3 3 1 3 1 3 1 3 1 3 3 1 3 1 1 3 1 3 3 3
#>
#> Within cluster sum of squares by cluster:
#> [1] 742.0970 904.9838 617.9859
#> (between_SS / total_SS = 79.8 %)
#>
#> Available components:
#>
#> [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
#> [6] "betweenss" "size" "iter" "ifault"
Created on 2022-08-31 by the reprex package (v2.0.1)
@EmilHvitfeldt Ditto per @stevensmallberg for me. The k-means vignette (with `num_clusters` specified) works. The Tuning Cluster Models vignette returns "All models failed." I also tried the dev version of tune for `tune::control_grid()`, with the same outcome.
If, instead of using a workflow, I pass the spec and preprocessor to `tune_cluster()` directly and use `control_cluster()`, then I get a different error, shown below:
library(tidyclust)
library(tidyverse)
library(tidymodels)
# Load the penguins data and drop rows containing missing values.
data("penguins", package = "modeldata")
penguins <- drop_na(penguins)

# Five-fold cross-validation resamples.
penguins_cv <- vfold_cv(penguins, v = 5)

# k-means specification with the number of clusters marked for tuning.
kmeans_spec <- k_means(num_clusters = tune())

# Cluster on the two bill measurements only.
penguins_rec <- recipe(
  ~ bill_length_mm + bill_depth_mm,
  data = penguins
)

# The workflow is deliberately left commented out in this variant.
# kmeans_wflow <- workflow(penguins_rec, kmeans_spec)

# Regular grid with ten candidate values for num_clusters.
clust_num_grid <- grid_regular(num_clusters(), levels = 10)
clust_num_grid
#> # A tibble: 10 × 1
#> num_clusters
#> <int>
#> 1 1
#> 2 2
#> 3 3
#> 4 4
#> 5 5
#> 6 6
#> 7 7
#> 8 8
#> 9 9
#> 10 10
# Tune by passing the model spec and the recipe separately (no workflow),
# using tidyclust's own control object.
res <- tune_cluster(
  object = kmeans_spec,
  preprocessor = penguins_rec,
  resamples = penguins_cv,
  grid = clust_num_grid,
  control = control_cluster(),
  metrics = cluster_metric_set(tot_wss, tot_sse, sse_ratio)
)
#> Error in allow && is_par: invalid 'x' type in 'x && y'

# `res` was never created because the call above errored.
res
#> Error in eval(expr, envir, enclos): object 'res' not found
Created on 2022-09-01 with reprex v2.0.2
@stevensmallberg helped me figure this one out, and it turns out it was an installation issue. To fix this problem, please install the {flexclust} and {Rfast} packages. Installing the most recent version of tidyclust by running `devtools::install_github("EmilHvitfeldt/tidyclust")` should force this change.
This issue has been automatically locked. If you believe you have found a related problem, please file a new issue (with a reprex: https://reprex.tidyverse.org) and link to this issue.
Really great to see clustering in tidymodels!
Tuning the following example, I'm not getting anything in `.notes`?
Created on 2022-08-29 with reprex v2.0.2
Session info
``` r sessioninfo::session_info() #> ─ Session info ─────────────────────────────────────────────────────────────── #> setting value #> version R version 4.2.1 (2022-06-23) #> os macOS Big Sur ... 10.16 #> system x86_64, darwin17.0 #> ui X11 #> language (EN) #> collate en_GB.UTF-8 #> ctype en_GB.UTF-8 #> tz Europe/London #> date 2022-08-29 #> pandoc 2.18 @ /Applications/RStudio.app/Contents/MacOS/quarto/bin/tools/ (via rmarkdown) #> #> ─ Packages ─────────────────────────────────────────────────────────────────── #> package * version date (UTC) lib source #> assertthat 0.2.1 2019-03-21 [1] CRAN (R 4.2.0) #> backports 1.4.1 2021-12-13 [1] CRAN (R 4.2.0) #> broom * 1.0.0 2022-07-01 [1] CRAN (R 4.2.0) #> cellranger 1.1.0 2016-07-27 [1] CRAN (R 4.2.0) #> class 7.3-20 2022-01-16 [1] CRAN (R 4.2.1) #> cli 3.3.0 2022-04-25 [1] CRAN (R 4.2.0) #> codetools 0.2-18 2020-11-04 [1] CRAN (R 4.2.1) #> colorspace 2.0-3 2022-02-21 [1] CRAN (R 4.2.0) #> crayon 1.5.1 2022-03-26 [1] CRAN (R 4.2.0) #> DBI 1.1.3 2022-06-18 [1] CRAN (R 4.2.0) #> dbplyr 2.2.1 2022-06-27 [1] CRAN (R 4.2.0) #> dials * 1.0.0 2022-06-14 [1] CRAN (R 4.2.0) #> DiceDesign 1.9 2021-02-13 [1] CRAN (R 4.2.0) #> digest 0.6.29 2021-12-01 [1] CRAN (R 4.2.0) #> dplyr * 1.0.9 2022-04-28 [1] CRAN (R 4.2.0) #> ellipsis 0.3.2 2021-04-29 [1] CRAN (R 4.2.0) #> evaluate 0.16 2022-08-09 [1] CRAN (R 4.2.0) #> fansi 1.0.3 2022-03-24 [1] CRAN (R 4.2.0) #> fastmap 1.1.0 2021-01-25 [1] CRAN (R 4.2.0) #> flexclust 1.4-1 2022-04-08 [1] CRAN (R 4.2.0) #> forcats * 0.5.2 2022-08-19 [1] CRAN (R 4.2.1) #> foreach 1.5.2 2022-02-02 [1] CRAN (R 4.2.0) #> fs 1.5.2 2021-12-08 [1] CRAN (R 4.2.0) #> furrr 0.3.1 2022-08-15 [1] CRAN (R 4.2.0) #> future 1.27.0 2022-07-22 [1] CRAN (R 4.2.1) #> future.apply 1.9.0 2022-04-25 [1] CRAN (R 4.2.0) #> gargle 1.2.0 2021-07-02 [1] CRAN (R 4.2.0) #> generics 0.1.3 2022-07-05 [1] CRAN (R 4.2.1) #> ggplot2 * 3.3.6 2022-05-03 [1] CRAN (R 4.2.0) #> globals 0.16.0 2022-08-05 [1] CRAN (R 4.2.1) #> glue 1.6.2 
2022-02-24 [1] CRAN (R 4.2.0) #> googledrive 2.0.0 2021-07-08 [1] CRAN (R 4.2.0) #> googlesheets4 1.0.1 2022-08-13 [1] CRAN (R 4.2.1) #> gower 1.0.0 2022-02-03 [1] CRAN (R 4.2.0) #> GPfit 1.0-8 2019-02-08 [1] CRAN (R 4.2.0) #> gtable 0.3.0 2019-03-25 [1] CRAN (R 4.2.0) #> hardhat 1.2.0 2022-06-30 [1] CRAN (R 4.2.0) #> haven 2.5.1 2022-08-22 [1] CRAN (R 4.2.1) #> highr 0.9 2021-04-16 [1] CRAN (R 4.2.0) #> hms 1.1.2 2022-08-19 [1] CRAN (R 4.2.1) #> htmltools 0.5.3 2022-07-18 [1] CRAN (R 4.2.1) #> httr 1.4.4 2022-08-17 [1] CRAN (R 4.2.0) #> infer * 1.0.3 2022-08-22 [1] CRAN (R 4.2.1) #> ipred 0.9-13 2022-06-02 [1] CRAN (R 4.2.0) #> iterators 1.0.14 2022-02-05 [1] CRAN (R 4.2.0) #> jsonlite 1.8.0 2022-02-22 [1] CRAN (R 4.2.0) #> knitr 1.40 2022-08-24 [1] CRAN (R 4.2.1) #> lattice 0.20-45 2021-09-22 [1] CRAN (R 4.2.1) #> lava 1.6.10 2021-09-02 [1] CRAN (R 4.2.0) #> lhs 1.1.5 2022-03-22 [1] CRAN (R 4.2.0) #> lifecycle 1.0.1 2021-09-24 [1] CRAN (R 4.2.0) #> listenv 0.8.0 2019-12-05 [1] CRAN (R 4.2.0) #> lubridate 1.8.0.9000 2022-06-09 [1] Github (tidyverse/lubridate@0bb49b2) #> magrittr 2.0.3 2022-03-30 [1] CRAN (R 4.2.0) #> MASS 7.3-58.1 2022-08-03 [1] CRAN (R 4.2.0) #> Matrix 1.4-1 2022-03-23 [1] CRAN (R 4.2.1) #> modeldata * 1.0.0 2022-07-01 [1] CRAN (R 4.2.0) #> modelr 0.1.9 2022-08-19 [1] CRAN (R 4.2.1) #> modeltools 0.2-23 2020-03-05 [1] CRAN (R 4.2.0) #> munsell 0.5.0 2018-06-12 [1] CRAN (R 4.2.0) #> nnet 7.3-17 2022-01-16 [1] CRAN (R 4.2.1) #> parallelly 1.32.1 2022-07-21 [1] CRAN (R 4.2.1) #> parsnip * 1.0.1.9000 2022-08-28 [1] Github (tidymodels/parsnip@e1eb30a) #> pillar 1.8.1 2022-08-19 [1] CRAN (R 4.2.1) #> pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.2.0) #> prodlim 2019.11.13 2019-11-17 [1] CRAN (R 4.2.0) #> purrr * 0.3.4 2020-04-17 [1] CRAN (R 4.2.0) #> R.cache 0.16.0 2022-07-21 [1] CRAN (R 4.2.1) #> R.methodsS3 1.8.2 2022-06-13 [1] CRAN (R 4.2.0) #> R.oo 1.25.0 2022-06-12 [1] CRAN (R 4.2.0) #> R.utils 2.12.0 2022-06-28 [1] CRAN (R 4.2.0) #> R6 2.5.1 
2021-08-19 [1] CRAN (R 4.2.0) #> Rcpp 1.0.9 2022-07-08 [1] CRAN (R 4.2.1) #> readr * 2.1.2 2022-01-30 [1] CRAN (R 4.2.0) #> readxl 1.4.1 2022-08-17 [1] CRAN (R 4.2.0) #> recipes * 1.0.1 2022-07-07 [1] CRAN (R 4.2.1) #> reprex 2.0.2 2022-08-17 [1] CRAN (R 4.2.0) #> rlang 1.0.4 2022-07-12 [1] CRAN (R 4.2.0) #> rmarkdown 2.16 2022-08-24 [1] CRAN (R 4.2.1) #> rpart 4.1.16 2022-01-24 [1] CRAN (R 4.2.1) #> rsample * 1.1.0 2022-08-08 [1] CRAN (R 4.2.1) #> rstudioapi 0.14 2022-08-22 [1] CRAN (R 4.2.1) #> rvest 1.0.3 2022-08-19 [1] CRAN (R 4.2.1) #> scales * 1.2.1 2022-08-20 [1] CRAN (R 4.2.1) #> sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.2.0) #> stringi 1.7.8 2022-07-11 [1] CRAN (R 4.2.0) #> stringr * 1.4.1 2022-08-20 [1] CRAN (R 4.2.0) #> styler 1.7.0 2022-03-13 [1] CRAN (R 4.2.0) #> survival 3.4-0 2022-08-09 [1] CRAN (R 4.2.0) #> tibble * 3.1.8 2022-07-22 [1] CRAN (R 4.2.1) #> tidyclust * 0.0.0.9000 2022-08-28 [1] Github (EmilHvitfeldt/tidyclust@71a245a) #> tidymodels * 1.0.0 2022-07-13 [1] CRAN (R 4.2.0) #> tidyr * 1.2.0 2022-02-01 [1] CRAN (R 4.2.0) #> tidyselect 1.1.2 2022-02-21 [1] CRAN (R 4.2.0) #> tidyverse * 1.3.2 2022-07-18 [1] CRAN (R 4.2.1) #> timeDate 4021.104 2022-07-19 [1] CRAN (R 4.2.0) #> tune * 1.0.0 2022-07-07 [1] CRAN (R 4.2.1) #> tzdb 0.3.0 2022-03-28 [1] CRAN (R 4.2.0) #> utf8 1.2.2 2021-07-24 [1] CRAN (R 4.2.0) #> vctrs 0.4.1 2022-04-13 [1] CRAN (R 4.2.0) #> withr 2.5.0 2022-03-03 [1] CRAN (R 4.2.0) #> workflows * 1.0.0.9000 2022-08-28 [1] Github (tidymodels/workflows@099a735) #> workflowsets * 1.0.0 2022-07-12 [1] CRAN (R 4.2.0) #> xfun 0.32 2022-08-10 [1] CRAN (R 4.2.1) #> xml2 1.3.3 2021-11-30 [1] CRAN (R 4.2.0) #> yaml 2.3.5 2022-02-21 [1] CRAN (R 4.2.0) #> yardstick * 1.0.0 2022-06-06 [1] CRAN (R 4.2.0) #> #> [1] /Library/Frameworks/R.framework/Versions/4.2/Resources/library #> #> ────────────────────────────────────────────────────────────────────────────── ```