tidymodels / tidyclust

A tidy unified interface to clustering models
https://tidyclust.tidymodels.org/
Other
109 stars 17 forks source link

[tglkmeans] - Better finding cluster centers #161

Open coforfe opened 1 year ago

coforfe commented 1 year ago

Hi Emil,

Thanks for your detailed description about the lower speed of tglkmeans ( #62 ).

The issue about the speed is something that can be corrected when using tglkmeans in a paraellized way. But for the relevant aspect of tglkmeans with respect to kmeans is that it offers a better cluster centers finding. tglkmeans is initialized in a different way than kmeansand it gets the right centers better than kmeans.

Please consider this code:

library(tidymodels)
library(tidyclust)
library(tglkmeans)
library(recipes)
library(tibble)

set.seed(1234)
data <- rbind(
  matrix(rnorm(100, mean = 1, sd = 0.3), ncol = 2),
  matrix(rnorm(100, mean = 2, sd = 0.3), ncol = 2),
  matrix(rnorm(100, mean = 3, sd = 0.3), ncol = 2),
  matrix(rnorm(100, mean = 4, sd = 0.3), ncol = 2),
  matrix(rnorm(100, mean = 5, sd = 0.3), ncol = 2)
)
colnames(data) <- c("x", "y")

data <- data %>% as.data.frame()

#------------------ SMALL --------------------
km          <- TGL_kmeans_tidy(data, 5)
kmstd       <- kmeans(data, 5)
kmstd$clust <- tibble(id = as.character(1:nrow(data)), clustkmstd = kmstd$cluster)

d <- left_join(km$cluster, kmstd$clust) %>% 
  mutate( compa = ifelse(clust == clustkmstd, 1, 0))

right_val <- sum(d$compa) * 100 / nrow(d)
error_val <- 100 - right_val
error_val

#------------------ MEDIUM --------------------
rec <- recipe(~., data = ames) |>
  step_dummy(all_nominal_predictors()) |>
  step_zv(all_predictors()) |>
  step_normalize(all_predictors())

ames_num <- prep(rec) |> 
  bake(new_data = NULL)

data <- ames_num

km          <- TGL_kmeans_tidy(data, 4)
kmstd       <- kmeans(data, 4)
kmstd$clust <- tibble(id = as.character(1:nrow(data)), clustkmstd = kmstd$cluster)

d <- left_join(km$cluster, kmstd$clust) %>% 
  mutate( compa = ifelse(clust == clustkmstd, 1, 0))

right_val <- sum(d$compa) * 100 / nrow(d)
error_val <- 100 - right_val
error_val

#------------------ LARGE --------------------
ames_num_big <- ames_num |>
  slice_sample(n = 1000000)

data <- ames_num_big

km          <- TGL_kmeans_tidy(data, 4)
kmstd       <- kmeans(data, 4)
kmstd$clust <- tibble(id = as.character(1:nrow(data)), clustkmstd = kmstd$cluster)

d <- left_join(km$cluster, kmstd$clust) %>% 
  mutate( compa = ifelse(clust == clustkmstd, 1, 0))

right_val <- sum(d$compa) * 100 / nrow(d)
error_val <- 100 - right_val
error_val

Which produces these results:

> #------------------ SMALL --------------------
> km          <- TGL_kmeans_tidy(data, 5)
Warning message:
In TGL_kmeans_tidy(data, 5) :
  Input doesn't have a column named "id". Using rownames instead.
> kmstd       <- kmeans(data, 5)
> kmstd$clust <- tibble(id = as.character(1:nrow(data)), clustkmstd = kmstd$cluster)
> 
> d <- left_join(km$cluster, kmstd$clust) %>% 
+   mutate( compa = ifelse(clust == clustkmstd, 1, 0))
Joining with `by = join_by(id)`
> 
> right_val <- sum(d$compa) * 100 / nrow(d)
> error_val <- 100 - right_val
> error_val
[1] 67.84983
> 
> 
> 
> #------------------ MEDIUM --------------------
> rec <- recipe(~., data = ames) |>
+   step_dummy(all_nominal_predictors()) |>
+   step_zv(all_predictors()) |>
+   step_normalize(all_predictors())
> 
> ames_num <- prep(rec) |> 
+   bake(new_data = NULL)
> 
> data <- ames_num
> 
> km          <- TGL_kmeans_tidy(data, 4)
Warning message:
In TGL_kmeans_tidy(data, 4) :
  Input doesn't have a column named "id". Using rownames instead.
> kmstd       <- kmeans(data, 4)
> kmstd$clust <- tibble(id = as.character(1:nrow(data)), clustkmstd = kmstd$cluster)
> 
> d <- left_join(km$cluster, kmstd$clust) %>% 
+   mutate( compa = ifelse(clust == clustkmstd, 1, 0))
Joining with `by = join_by(id)`
> 
> right_val <- sum(d$compa) * 100 / nrow(d)
> error_val <- 100 - right_val
> error_val
[1] 24.57338
> 
> 
> 
> #------------------ LARGE --------------------
> ames_num_big <- ames_num |>
+   slice_sample(n = 1000000)
> 
> data <- ames_num_big
> 
> km          <- TGL_kmeans_tidy(data, 4)
Warning message:
In TGL_kmeans_tidy(data, 4) :
  Input doesn't have a column named "id". Using rownames instead.
> kmstd       <- kmeans(data, 4)
> kmstd$clust <- tibble(id = as.character(1:nrow(data)), clustkmstd = kmstd$cluster)
> 
> d <- left_join(km$cluster, kmstd$clust) %>% 
+   mutate( compa = ifelse(clust == clustkmstd, 1, 0))
Joining with `by = join_by(id)`
> 
> right_val <- sum(d$compa) * 100 / nrow(d)
> error_val <- 100 - right_val
> error_val
[1] 95.93857
> 

Thanks again, Carlos.

EmilHvitfeldt commented 1 year ago

Thanks for letter me know!