Closed daniepi closed 1 year ago
@daniepi, can you make your example reproducible? You show Python code, then use the Python data in R, and moreover you utilize data.table. Can you wrap your code (which seems to compute kmeans) in a function, upload the data as a file, and show — using equal (identical) or non-equal (non-identical) R objects — where the differences are?
Hi. Sorry for delay. Trying with reproducible example, fully in R
.
library(reticulate)
library(ClusterR)
library(data.table)

# Build a 3-cluster blob dataset via sklearn (20000 x 3 matrix + labels)
sk <- import("sklearn.datasets")
blobs <- sk$make_blobs(n_samples = 20000L, n_features = 3L, centers = 3L,
                       center_box = c(-3, 3), random_state = 3L)
data_blobs <- blobs[[1]]

km_rcpp <- ClusterR::KMeans_rcpp(data_blobs, clusters = 2L, num_init = 3L,
                                 max_iters = 10L, initializer = "kmeans++")
km_rcpp$clusters <- as.integer(km_rcpp$clusters)

clusters <- data.table(cluster = km_rcpp$clusters)
centers <- km_rcpp$centroids

# Calculate distances from the centers and get the minimum distance per record.
# The transpose is loop-invariant, so hoist it out of the loop.
t_data <- t(data_blobs)
for (i in seq_len(nrow(centers))) {
  col <- paste0("center_", i)
  # Euclidean distance of every observation to centroid i
  clusters[, (col) := sqrt(colSums((t_data - centers[i, ])^2))]
}

# Minimum distance and the index of the closest centroid per record
clusters[, closest := do.call(pmin, .SD), .SDcols = patterns("center")]
clusters[, cluster_closest := apply(.SD, 1, which.min), .SDcols = patterns("center")]

nrow(clusters[cluster != cluster_closest])
all.equal(clusters$cluster, clusters$cluster_closest)
I was also wondering: is there any specific reason for the `$clusters` object being returned as double instead of int?
@daniepi thank you for the example code. with a few adjustments I see that there are differences between the KMeans_rcpp and your code for specific seeds (blobs are probably created based on a random sampling),
library(reticulate)
library(ClusterR)
library(data.table)
library(glue)

# The sklearn import is loop-invariant: hoist it out of the loop
sk <- import("sklearn.datasets", convert = TRUE)

# Compare KMeans_rcpp clusters against clusters recomputed from its own
# centroids, over 21 different random seeds.
for (set_seed in 0:20) {
  set_seed <- as.integer(set_seed)
  blobs <- sk$make_blobs(n_samples = 20000L,
                         n_features = 3L,
                         centers = 3L,
                         center_box = c(-3, 3),
                         random_state = set_seed)
  # str(blobs)
  data_blobs <- blobs[[1]]
  cluster_blobs <- blobs[[2]]

  km_rcpp <- ClusterR::KMeans_rcpp(data = data_blobs,
                                   clusters = 2L,
                                   num_init = 3L,
                                   max_iters = 10L,
                                   initializer = "kmeans++",
                                   seed = set_seed)
  # str(km_rcpp)
  km_rcpp$clusters <- as.integer(km_rcpp$clusters)
  clusters <- data.table(cluster = km_rcpp$clusters)
  # clusters
  centers <- km_rcpp$centroids
  # centers

  # Calculate distances from the centers and get the minimum distance per record
  t_data <- t(data_blobs)
  for (i in seq_len(nrow(centers))) {
    col <- paste0("center_", i)
    clusters[, (col) := sqrt(colSums((t_data - centers[i, ])^2))]
  }
  # clusters

  # Sanity check: one distance row per observation
  stopifnot(nrow(data_blobs) == nrow(clusters))
  clusters[, closest := do.call(pmin, .SD), .SDcols = patterns("center")]
  clusters[, cluster_closest := apply(.SD, 1, which.min), .SDcols = patterns("center")]
  nrow(clusters[cluster != cluster_closest])
  # all.equal(clusters$cluster, clusters$cluster_closest)

  # Confusion table between fitted clusters and recomputed closest-centroid clusters
  cols_clusts <- c('cluster', 'cluster_closest')
  df <- table(clusters[, ..cols_clusts])
  cat(glue::glue("seed: {set_seed} clusters-equal: {sum(diag(df)) == nrow(data_blobs)} difference: {nrow(data_blobs) - sum(diag(df))} unique-clusters-equal: {length(unique(clusters$cluster)) == length(unique(clusters$cluster_closest))}"), '\n')
}
# seed: 0 clusters-equal: FALSE difference: 6 unique-clusters-equal: TRUE
# seed: 1 clusters-equal: TRUE difference: 0 unique-clusters-equal: TRUE
# seed: 2 clusters-equal: FALSE difference: 5 unique-clusters-equal: TRUE
# seed: 3 clusters-equal: TRUE difference: 0 unique-clusters-equal: TRUE
# seed: 4 clusters-equal: TRUE difference: 0 unique-clusters-equal: TRUE
# seed: 5 clusters-equal: TRUE difference: 0 unique-clusters-equal: TRUE
# seed: 6 clusters-equal: FALSE difference: 26 unique-clusters-equal: TRUE
# seed: 7 clusters-equal: TRUE difference: 0 unique-clusters-equal: TRUE
# seed: 8 clusters-equal: TRUE difference: 0 unique-clusters-equal: TRUE
# seed: 9 clusters-equal: FALSE difference: 9 unique-clusters-equal: TRUE
# seed: 10 clusters-equal: TRUE difference: 0 unique-clusters-equal: TRUE
# seed: 11 clusters-equal: FALSE difference: 1 unique-clusters-equal: TRUE
# seed: 12 clusters-equal: TRUE difference: 0 unique-clusters-equal: TRUE
# seed: 13 clusters-equal: FALSE difference: 1 unique-clusters-equal: TRUE
# seed: 14 clusters-equal: FALSE difference: 1 unique-clusters-equal: TRUE
# seed: 15 clusters-equal: FALSE difference: 3 unique-clusters-equal: TRUE
# seed: 16 clusters-equal: TRUE difference: 0 unique-clusters-equal: TRUE
# seed: 17 clusters-equal: TRUE difference: 0 unique-clusters-equal: TRUE
# seed: 18 clusters-equal: TRUE difference: 0 unique-clusters-equal: TRUE
# seed: 19 clusters-equal: TRUE difference: 0 unique-clusters-equal: TRUE
# seed: 20 clusters-equal: TRUE difference: 0 unique-clusters-equal: TRUE
Can you add a third function so that we can verify that there is a difference? You mentioned that you also tested with the `stats::kmeans()` function?
Ok, so I took your code and replaced KMeans_rcpp
with stats::kmeans()
.
library(reticulate)
library(ClusterR)
library(data.table)
library(glue)

sk <- import("sklearn.datasets", convert = TRUE)

# Same experiment as before, but with stats::kmeans() instead of KMeans_rcpp()
for (set_seed in 0:20) {
  set_seed <- as.integer(set_seed)
  blobs <- sk$make_blobs(n_samples = 20000L,
                         n_features = 3L,
                         centers = 3L,
                         center_box = c(-3, 3),
                         random_state = set_seed)
  data_blobs <- blobs[[1]]

  km_stats <- stats::kmeans(x = data_blobs, centers = 2L, iter.max = 10L, nstart = 3L)
  clusters <- data.table(cluster = km_stats$cluster)
  # clusters
  centers <- km_stats$centers
  # centers

  # Calculate distances from the centers and get the minimum distance per record
  t_data <- t(data_blobs)
  for (i in seq_len(nrow(centers))) {
    col <- paste0("center_", i)
    clusters[, (col) := sqrt(colSums((t_data - centers[i, ])^2))]
  }
  # clusters

  # Sanity check: one distance row per observation
  stopifnot(nrow(data_blobs) == nrow(clusters))
  clusters[, closest := do.call(pmin, .SD), .SDcols = patterns("center")]
  clusters[, cluster_closest := apply(.SD, 1, which.min), .SDcols = patterns("center")]
  nrow(clusters[cluster != cluster_closest])
  # all.equal(clusters$cluster, clusters$cluster_closest)

  cols_clusts <- c('cluster', 'cluster_closest')
  df <- table(clusters[, ..cols_clusts])
  cat(glue::glue("seed: {set_seed} clusters-equal: {sum(diag(df)) == nrow(data_blobs)} difference: {nrow(data_blobs) - sum(diag(df))} unique-clusters-equal: {length(unique(clusters$cluster)) == length(unique(clusters$cluster_closest))}"), '\n')
}
seed: 0 clusters-equal: TRUE difference: 0 unique-clusters-equal: TRUE
seed: 1 clusters-equal: TRUE difference: 0 unique-clusters-equal: TRUE
seed: 2 clusters-equal: TRUE difference: 0 unique-clusters-equal: TRUE
seed: 3 clusters-equal: TRUE difference: 0 unique-clusters-equal: TRUE
seed: 4 clusters-equal: TRUE difference: 0 unique-clusters-equal: TRUE
seed: 5 clusters-equal: TRUE difference: 0 unique-clusters-equal: TRUE
seed: 6 clusters-equal: TRUE difference: 0 unique-clusters-equal: TRUE
seed: 7 clusters-equal: TRUE difference: 0 unique-clusters-equal: TRUE
seed: 8 clusters-equal: TRUE difference: 0 unique-clusters-equal: TRUE
seed: 9 clusters-equal: TRUE difference: 0 unique-clusters-equal: TRUE
seed: 10 clusters-equal: TRUE difference: 0 unique-clusters-equal: TRUE
seed: 11 clusters-equal: TRUE difference: 0 unique-clusters-equal: TRUE
seed: 12 clusters-equal: TRUE difference: 0 unique-clusters-equal: TRUE
seed: 13 clusters-equal: TRUE difference: 0 unique-clusters-equal: TRUE
seed: 14 clusters-equal: TRUE difference: 0 unique-clusters-equal: TRUE
seed: 15 clusters-equal: TRUE difference: 0 unique-clusters-equal: TRUE
seed: 16 clusters-equal: TRUE difference: 0 unique-clusters-equal: TRUE
seed: 17 clusters-equal: TRUE difference: 0 unique-clusters-equal: TRUE
seed: 18 clusters-equal: TRUE difference: 0 unique-clusters-equal: TRUE
seed: 19 clusters-equal: TRUE difference: 0 unique-clusters-equal: TRUE
seed: 20 clusters-equal: TRUE difference: 0 unique-clusters-equal: TRUE
@daniepi give me a few days to have a look at the code and find out why there are differences compared to your code and to the base kmeans.
@mlampros take your time. For nicely clusterable dummy data (like `iris`) there are no discrepancies. However, for other datasets it seems like `KMeans_rcpp()` assigns some of the observations to a cluster whose centroid is not really the closest one.
@daniepi I took a look at my RcppArmadillo code this week (I wrote this code back in 2017, so I don't remember everything and I have to go through the code every time).
It seems there is no bug in the code and the differences that you see between your data.table code and the ClusterR::KMeans_rcpp() function are related to the default parameters that you use in combination with the sklearn datasets.
If you take a look at the documentation of the sklearn KMeans (which I also consulted when I wrote the RcppArmadillo code), the authors use a specific number of maximum iterations (max_iter=300), number of initializations (n_init='warn' or n_init=10) and algorithm (algorithm='lloyd', which I also use by default in the RcppArmadillo code).
Now, what you do in your data.table code is to take the output centroids of either base::kmeans() or ClusterR::KMeans_rcpp() and simply compute the clusters (and you use num_init = 3L and max_iters = 10L, which is not enough for ClusterR::KMeans_rcpp() to converge). That means you take the centroids and clusters of a non-converged algorithm and compare those clusters with your post-computed clusters from your data.table code, which is not correct.
The following example code (for reproducibility) uses the same input parameters for all three implementations, and I also compute the external validation metric 'rand_index', which is in effect the accuracy between true and predicted clusters (as true clusters I use the clusters that you have computed based on the base-R kmeans algorithm),
library(reticulate)
library(ClusterR)
library(data.table)
library(glue)

# sklearn data and kmeans
sk <- import("sklearn.datasets", convert = TRUE)
sk_kms <- import("sklearn.cluster", convert = TRUE)

# iterate 21 times with different seed
for (set_seed in 0:20) {
  set_seed <- as.integer(set_seed)
  blobs <- sk$make_blobs(n_samples = 20000L,
                         n_features = 3L,
                         centers = 3L,
                         center_box = c(-3L, 3L),
                         random_state = set_seed)
  data_blobs <- blobs[[1]]

  # input parameters based on sklearn.cluster.KMeans
  # https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
  # https://scikit-learn.org/stable/modules/clustering.html#k-means
  CENTERS <- 2L
  N_START <- 10L
  ITER_MAX <- 300L

  # base R kmeans
  set.seed(seed = set_seed)
  km_stats <- stats::kmeans(x = data_blobs,
                            centers = CENTERS,
                            iter.max = ITER_MAX,
                            nstart = N_START,
                            algorithm = "Lloyd")
  centers <- km_stats$centers

  # sklearn kmeans
  # "auto" and "full" are deprecated and they will be removed in Scikit-Learn 1.3.
  # They are both aliases for "lloyd", so request "lloyd" directly to avoid the
  # deprecation warning and keep working after 1.3.
  kmeans_sk <- sk_kms$KMeans(n_clusters = CENTERS,
                             init = 'k-means++',
                             max_iter = ITER_MAX,
                             n_init = N_START,
                             random_state = set_seed,
                             algorithm = 'lloyd')$fit(data_blobs)
  centers_sk <- kmeans_sk$cluster_centers_

  # ClusterR kmeans
  km_rcpp <- ClusterR::KMeans_rcpp(data = data_blobs,
                                   clusters = CENTERS,
                                   num_init = N_START,
                                   max_iters = ITER_MAX,
                                   initializer = "kmeans++",
                                   seed = set_seed)
  centers_rcpp <- km_rcpp$centroids

  # initialize data.table with base-kmeans clusters
  clusters <- data.table(cluster = km_stats$cluster)

  # Calculate distances from the centers and get the minimum distance per record
  t_data <- t(data_blobs)
  for (i in seq_len(nrow(centers))) {
    col <- paste0("center_", i)
    clusters[, (col) := sqrt(colSums((t_data - centers[i, ])^2))]
  }

  # computation of clusters based on computed centroids
  clusters[, closest := do.call(pmin, .SD), .SDcols = patterns("center")]
  clusters[, cluster_closest := apply(.SD, 1, which.min), .SDcols = patterns("center")]

  # 'rand_index' is equal to accuracy
  acc_true_preds_base <- ClusterR::external_validation(true_labels = clusters$cluster_closest, clusters = km_stats$cluster, method = 'rand_index')
  acc_true_preds_sklearn <- ClusterR::external_validation(true_labels = clusters$cluster_closest, clusters = kmeans_sk$labels_, method = 'rand_index')
  acc_true_preds_clusterR <- ClusterR::external_validation(true_labels = clusters$cluster_closest, clusters = km_rcpp$clusters, method = 'rand_index')
  cat(glue::glue("seed: {set_seed} accuracy_base_Kmeans: {acc_true_preds_base} accuracy_SKlearn_kmeans: {acc_true_preds_sklearn} accuracy_ClusterR_Kmeans: {acc_true_preds_clusterR}"), '\n')

  # seed: 0 accuracy_base_Kmeans: 1 accuracy_SKlearn_kmeans: 0.9998000100005 accuracy_ClusterR_Kmeans: 1
  # seed: 1 accuracy_base_Kmeans: 1 accuracy_SKlearn_kmeans: 0.9998000100005 accuracy_ClusterR_Kmeans: 1
  # seed: 2 accuracy_base_Kmeans: 1 accuracy_SKlearn_kmeans: 0.998401200060003 accuracy_ClusterR_Kmeans: 1
  # seed: 3 accuracy_base_Kmeans: 1 accuracy_SKlearn_kmeans: 0.998900550027501 accuracy_ClusterR_Kmeans: 1
  # seed: 4 accuracy_base_Kmeans: 1 accuracy_SKlearn_kmeans: 1 accuracy_ClusterR_Kmeans: 1
  # seed: 5 accuracy_base_Kmeans: 1 accuracy_SKlearn_kmeans: 0.9998000100005 accuracy_ClusterR_Kmeans: 1
  # seed: 6 accuracy_base_Kmeans: 1 accuracy_SKlearn_kmeans: 0.993520801040052 accuracy_ClusterR_Kmeans: 1
  # seed: 7 accuracy_base_Kmeans: 1 accuracy_SKlearn_kmeans: 0.9999 accuracy_ClusterR_Kmeans: 1
  # seed: 8 accuracy_base_Kmeans: 1 accuracy_SKlearn_kmeans: 1 accuracy_ClusterR_Kmeans: 1
  # seed: 9 accuracy_base_Kmeans: 1 accuracy_SKlearn_kmeans: 0.998600910045502 accuracy_ClusterR_Kmeans: 1
  # seed: 10 accuracy_base_Kmeans: 1 accuracy_SKlearn_kmeans: 0.983930416520826 accuracy_ClusterR_Kmeans: 1
  # seed: 11 accuracy_base_Kmeans: 1 accuracy_SKlearn_kmeans: 0.998401200060003 accuracy_ClusterR_Kmeans: 1
  # seed: 12 accuracy_base_Kmeans: 1 accuracy_SKlearn_kmeans: 1 accuracy_ClusterR_Kmeans: 1
  # seed: 13 accuracy_base_Kmeans: 1 accuracy_SKlearn_kmeans: 0.999200280014001 accuracy_ClusterR_Kmeans: 1
  # seed: 14 accuracy_base_Kmeans: 1 accuracy_SKlearn_kmeans: 1 accuracy_ClusterR_Kmeans: 1
  # seed: 15 accuracy_base_Kmeans: 1 accuracy_SKlearn_kmeans: 0.9998000100005 accuracy_ClusterR_Kmeans: 1
  # seed: 16 accuracy_base_Kmeans: 1 accuracy_SKlearn_kmeans: 0.999600060003 accuracy_ClusterR_Kmeans: 1
  # seed: 17 accuracy_base_Kmeans: 1 accuracy_SKlearn_kmeans: 0.9994001500075 accuracy_ClusterR_Kmeans: 1
  # seed: 18 accuracy_base_Kmeans: 1 accuracy_SKlearn_kmeans: 0.9999 accuracy_ClusterR_Kmeans: 1
  # seed: 19 accuracy_base_Kmeans: 1 accuracy_SKlearn_kmeans: 1 accuracy_ClusterR_Kmeans: 1
  # seed: 20 accuracy_base_Kmeans: 1 accuracy_SKlearn_kmeans: 0.9997000300015 accuracy_ClusterR_Kmeans: 1
}
As you see now in the output of this for-loop for the recommended number of initializations and maximum iterations the 'ClusterR::KMeans_rcpp()' converges and gives an accuracy of 1.0 for all 20 sample datasets when compared to the clusters computed with your data.table code based on the centroids of the base-R-kmeans algorithm.
I'll close the issue for now, feel free to re-open it in case the code does not work as expected
Hi. What I observe when using my dataset (and also a dummy dataset created via sklearn `make_blobs`) is that some of the final cluster assignments made by `KMeans_rcpp` are not really to the closest centroid. `stats::kmeans()` does not seem to have this issue: the clusters it assigns really are to the closest centroid. I first thought it had something to do with `initializer = "kmeans++"`, but the same applies to `initializer = "random"`. With `iris`, there is no difference between `stats::kmeans()` and `KMeans_rcpp`. I create the dataset with Python; the rest is in R.