AlexanderFabisch / gmr

Gaussian Mixture Regression
https://alexanderfabisch.github.io/gmr/
BSD 3-Clause "New" or "Revised" License
168 stars 49 forks source link

Numerical problems lead to worse scores in comparison to sklearn #30

Closed AlexanderFabisch closed 3 years ago

AlexanderFabisch commented 3 years ago

Relevant examples at the end of this pull request: https://github.com/AlexanderFabisch/gmr/pull/28 from @mralbu

Did some additional comparisons with an experiment using sklearn.mixture.GaussianMixture machinery for fitting the regressor.

from sklego.mixture import GMMRegressor

np.set_printoptions(precision=4)

np.random.seed(2)

# Ten repeated fits with the default (kmeans-based) initialization:
# every run converges to the same score.
scores = [
    GMMRegressor(n_components=2).fit(X, y).score(X, y)
    for _ in range(10)
]
print(np.array(scores))
>> [0.8478 0.8478 0.8478 0.8478 0.8478 0.8478 0.8478 0.8478 0.8478 0.8478]

np.random.seed(2)

# Same experiment but with random initialization: scores now vary
# from run to run, exposing the sensitivity to the starting point.
scores = []
for trial in range(10):
    regressor = GMMRegressor(n_components=2, init_params='random', max_iter=100)
    regressor.fit(X, y)
    scores.append(regressor.score(X, y))
print(np.array(scores))
>> [0.8157 0.8061 0.8221 0.8152 0.8221 0.8192 0.8479 0.8282 0.8251 0.7792]

Maybe using internal sklearn.mixture machinery might help ease numerical issues, though it would introduce sklearn as a hard dependency and might be out of scope. On the other hand, it would enable the introduction of other regressors such as BayesianGMMRegressor in an easy way, and would have familiar parameters (the same as in sklearn.mixture.GaussianMixture). Do you think exploring the use of sklearn.mixture inner workings would be interesting for gmr?

AlexanderFabisch commented 3 years ago

Test code:

from sklearn.base import BaseEstimator, RegressorMixin, MultiOutputMixin
from sklearn.utils import check_X_y
from sklearn.utils.validation import (check_is_fitted, check_array,
                                      FLOAT_DTYPES)
from sklearn.mixture import GaussianMixture

from gmr.gmm import GMM

class GaussianMixtureRegressorSklearn(MultiOutputMixin, RegressorMixin, BaseEstimator):
    """Gaussian mixture regressor fitted with sklearn.mixture.GaussianMixture.

    Fits a joint GMM over the stacked (X, y) data using sklearn's EM
    implementation, then wraps the learned parameters in gmr's GMM for
    conditional prediction.

    Parameters
    ----------
    n_components : int
        Number of mixture components.
    verbose : int, optional (default: 0)
        Verbosity level, forwarded to the gmr GMM wrapper.
    random_state : int, RandomState instance or None, optional (default: None)
        Seed controlling the EM initialization, forwarded to GaussianMixture.
    R_diff : float, optional (default: 1e-4)
        Convergence tolerance (mapped to GaussianMixture's ``tol``).
    n_iter : int, optional (default: 500)
        Maximum number of EM iterations (mapped to ``max_iter``).
    init_params : str, optional (default: "random")
        Initialization method, forwarded to GaussianMixture.
    """

    def __init__(
            self, n_components, verbose=0, random_state=None, R_diff=1e-4,
            n_iter=500, init_params="random"):
        self.n_components = n_components
        self.verbose = verbose
        self.random_state = random_state
        self.R_diff = R_diff
        self.n_iter = n_iter
        self.init_params = init_params

    def fit(self, X, y):
        """Fit a joint GMM over (X, y) and prepare it for regression.

        Returns ``self`` so calls can be chained (sklearn convention).
        """
        # BUG FIX: random_state was accepted in __init__ but never forwarded
        # to GaussianMixture, so the estimator's own random_state parameter
        # had no effect and fits were not reproducible through it.
        gmm_ = GaussianMixture(
            self.n_components, init_params=self.init_params,
            max_iter=self.n_iter, tol=self.R_diff,
            random_state=self.random_state)

        X, y = check_X_y(X, y, estimator=gmm_, dtype=FLOAT_DTYPES,
                         multi_output=True)
        # Ensure y is 2D so it can be stacked column-wise next to X.
        if y.ndim == 1:
            y = np.expand_dims(y, 1)

        # Column indices of the input features within the joint (X, y) data;
        # gmr conditions on these indices at prediction time.
        self.indices_ = np.arange(X.shape[1])

        # Fit the joint density, then hand the learned parameters to gmr.
        gmm_.fit(np.hstack((X, y)))
        self.gmm_ = GMM(self.n_components, priors=gmm_.weights_,
                        means=gmm_.means_, covariances=gmm_.covariances_,
                        verbose=self.verbose)
        return self

    def predict(self, X):
        """Predict y as the conditional mean of the joint GMM given X."""
        check_is_fitted(self, ["gmm_", "indices_"])
        X = check_array(X, estimator=self.gmm_, dtype=FLOAT_DTYPES)

        return self.gmm_.predict(self.indices_, X)

import numpy as np
from sklearn.datasets import load_boston
from gmr.sklearn import GaussianMixtureRegressor

X, y = load_boston(return_X_y=True)

np.set_printoptions(precision=2, suppress=True)

np.random.seed(2)

# Baseline: gmr's own EM implementation with kmeans++ initialization
# and a tight convergence tolerance.
scores = []
for trial in range(10):
    regressor = GaussianMixtureRegressor(
        n_components=2, verbose=10, R_diff=1e-7, init_params="kmeans++")
    regressor.fit(X, y)
    score = regressor.score(X, y)
    print(f"{score:.2f}")
    scores.append(score)
print(np.array(scores))

np.random.seed(2)

# Comparison: same experiment with the sklearn-backed regressor and
# random initialization.
scores = []
for trial in range(10):
    regressor = GaussianMixtureRegressorSklearn(
        n_components=2, verbose=10, R_diff=1e-7, init_params="random")
    regressor.fit(X, y)
    score = regressor.score(X, y)
    print(f"{score:.2f}")
    scores.append(score)
print(np.array(scores))