Labo-Lacourse / stepmix

A Python package following the scikit-learn API for model-based clustering and generalized mixture modeling (latent class/profile analysis) of continuous and categorical data. StepMix handles missing values through Full Information Maximum Likelihood (FIML) and provides multiple stepwise Expectation-Maximization (EM) estimation methods.
https://stepmix.readthedocs.io/en/latest/index.html
MIT License
60 stars 4 forks source link

init_params = 'kmeans' #54

Closed mohrezazali closed 10 months ago

mohrezazali commented 10 months ago

Hello. When I set init_params to 'kmeans', I get an error with a long traceback. The same model with the same data worked properly until yesterday, but today it no longer works.

sachaMorin commented 10 months ago

I just tested init_params='kmeans' locally and it seems to work normally. Can you share the error message? And ideally some code reproducing the error?

Thanks,

sachaMorin commented 10 months ago

This is what I tested specifically.

from stepmix.stepmix import  StepMix
from stepmix.datasets import data_bakk_response
from sklearn.metrics import rand_score

X, Y, gt = data_bakk_response(1000, sep_level=.9, random_state=42)

model = StepMix(n_components=3, n_steps=1, measurement='binary', structural='gaussian_unit', random_state=42, init_params='kmeans')
model.fit(X, Y)
preds = model.predict(X, Y)
print(rand_score(gt, preds))

mohrezazali commented 10 months ago
from stepmix.utils import get_mixed_descriptor

X = df[['elect', 'treated', 'duration']]
y = df['hazard']

mixed_data, mixed_descriptor = get_mixed_descriptor(
    dataframe=df,
    continuous=['treated', 'duration'],
    binary=['elect'],
)

effect_model = StepMix(n_components=3, measurement=mixed_descriptor, n_init=3, init_params='kmeans', n_steps=3, 
                structural='binary', verbose=1, random_state=123)

effect_model.fit(mixed_data, y)

sachaMorin commented 10 months ago

Thank you! And can you also share the error? (copy pasting the error message would be fine)

mohrezazali commented 10 months ago

AttributeError                            Traceback (most recent call last)
Cell In[211], line 17
      6 mixed_data, mixed_descriptor = get_mixed_descriptor(
      7     dataframe=df,
      8     continuous=['treated', 'duration'],
      9     binary=['elect'],
     10     # categorical=['sepal width (q=3)']
     11 )
     14 effect_model = StepMixClassifier(n_components=3, measurement=mixed_descriptor,n_init=3, correction= 'BCH', init_params='kmeans', n_steps=3,
     15                 structural='binary',
     16                 verbose=1, random_state=123)
---> 17 effect_model.fit(mixed_data, y)

File ~/anaconda3/lib/python3.10/site-packages/stepmix/stepmix.py:771, in StepMix.fit(self, X, Y, sample_weight, y)
    766     self.m_step_structural(resp, Y)
    768 elif self.n_steps == 3 and self.correction == "BCH":
    769     # Three-step estimation with BCH correction
    770     # 1) Fit the measurement model
--> 771     self.em(X)
    773     # 2) Assign class probabilities
    774     soft_resp = self.predict_proba_class(X)

File ~/anaconda3/lib/python3.10/site-packages/stepmix/stepmix.py:886, in StepMix.em(self, X, Y, sample_weight, freeze_measurement, log_emission_pm)
    884 for init in tqdm_init:
    885     if not freeze_measurement:
--> 886         self._initialize_parameters(X, random_state)  # Measurement model
    888     if Y is not None:
    889         self._initialize_parameters_structural(
    890             Y, random_state
    891         )  # Structural Model

File ~/anaconda3/lib/python3.10/site-packages/stepmix/stepmix.py:361, in StepMix._initialize_parameters(self, X, random_state)
    355 if self.init_params == "kmeans":
    356     resp = np.zeros((n_samples, self.n_components))
    357     label = (
    358         KMeans(
    359             n_clusters=self.n_components, n_init=1, random_state=random_state
    360         )
--> 361         .fit(X)
    362         .labels_
    363     )
    364     resp[np.arange(n_samples), label] = 1
    365 elif self.init_params == "random":

File ~/anaconda3/lib/python3.10/site-packages/sklearn/base.py:1151, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs)
   1144     estimator._validate_params()
   1146 with config_context(
   1147     skip_parameter_validation=(
   1148         prefer_skip_nested_validation or global_skip_validation
   1149     )
   1150 ):
-> 1151     return fit_method(estimator, *args, **kwargs)

File ~/anaconda3/lib/python3.10/site-packages/sklearn/cluster/_kmeans.py:1526, in KMeans.fit(self, X, y, sample_weight)
   1523     print("Initialization complete")
   1525 # run a k-means once
-> 1526 labels, inertia, centers, n_iter_ = kmeans_single(
   1527     X,
   1528     sample_weight,
   1529     centers_init,
   1530     max_iter=self.max_iter,
   1531     verbose=self.verbose,
   1532     tol=self._tol,
   1533     n_threads=self._n_threads,
   1534 )
   1536 # determine if these results are the best so far
   1537 # we chose a new run if it has a better inertia and the clustering is
   1538 # different from the best so far (it's possible that the inertia is
   1539 # slightly better even if the clustering is the same with potentially
   1540 # permuted labels, due to rounding errors)
   1541 if best_inertia is None or (
   1542     inertia < best_inertia
   1543     and not _is_same_clustering(labels, best_labels, self.n_clusters)
   1544 ):

File ~/anaconda3/lib/python3.10/site-packages/sklearn/cluster/_kmeans.py:688, in _kmeans_single_lloyd(X, sample_weight, centers_init, max_iter, verbose, tol, n_threads)
    684 strict_convergence = False
    686 # Threadpoolctl context to limit the number of threads in second level of
    687 # nested parallelism (i.e. BLAS) to avoid oversubscription.
--> 688 with threadpool_limits(limits=1, user_api="blas"):
    689     for i in range(max_iter):
    690         lloyd_iter(
    691             X,
    692             sample_weight,
   (...)
    698             n_threads,
    699         )

File ~/anaconda3/lib/python3.10/site-packages/sklearn/utils/fixes.py:72, in threadpool_limits(limits, user_api)
     70     return controller.limit(limits=limits, user_api=user_api)
     71 else:
---> 72     return threadpoolctl.threadpool_limits(limits=limits, user_api=user_api)

File ~/anaconda3/lib/python3.10/site-packages/threadpoolctl.py:171, in threadpool_limits.__init__(self, limits, user_api)
    167 def __init__(self, limits=None, user_api=None):
    168     self._limits, self._user_api, self._prefixes = \
    169         self._check_params(limits, user_api)
--> 171     self._original_info = self._set_threadpool_limits()

File ~/anaconda3/lib/python3.10/site-packages/threadpoolctl.py:268, in threadpool_limits._set_threadpool_limits(self)
    265 if self._limits is None:
    266     return None
--> 268 modules = _ThreadpoolInfo(prefixes=self._prefixes,
    269                           user_api=self._user_api)
    270 for module in modules:
    271     # self._limits is a dict {key: num_threads} where key is either
    272     # a prefix or a user_api. If a module matches both, the limit
    273     # corresponding to the prefix is chosed.
    274     if module.prefix in self._limits:

File ~/anaconda3/lib/python3.10/site-packages/threadpoolctl.py:340, in _ThreadpoolInfo.__init__(self, user_api, prefixes, modules)
    337     self.user_api = [] if user_api is None else user_api
    339     self.modules = []
--> 340     self._load_modules()
    341     self._warn_if_incompatible_openmp()
    342 else:

File ~/anaconda3/lib/python3.10/site-packages/threadpoolctl.py:371, in _ThreadpoolInfo._load_modules(self)
    369 """Loop through loaded libraries and store supported ones"""
    370 if sys.platform == "darwin":
--> 371     self._find_modules_with_dyld()
    372 elif sys.platform == "win32":
    373     self._find_modules_with_enum_process_module_ex()

File ~/anaconda3/lib/python3.10/site-packages/threadpoolctl.py:428, in _ThreadpoolInfo._find_modules_with_dyld(self)
    425 filepath = filepath.decode("utf-8")
    427 # Store the module if it is supported and selected
--> 428 self._make_module_from_path(filepath)

File ~/anaconda3/lib/python3.10/site-packages/threadpoolctl.py:515, in _ThreadpoolInfo._make_module_from_path(self, filepath)
    513 if prefix in self.prefixes or user_api in self.user_api:
    514     module_class = globals()[module_class]
--> 515     module = module_class(filepath, prefix, user_api, internal_api)
    516     self.modules.append(module)

File ~/anaconda3/lib/python3.10/site-packages/threadpoolctl.py:606, in _Module.__init__(self, filepath, prefix, user_api, internal_api)
    604 self.internal_api = internal_api
    605 self._dynlib = ctypes.CDLL(filepath, mode=_RTLD_NOLOAD)
--> 606 self.version = self.get_version()
    607 self.num_threads = self.get_num_threads()
    608 self._get_extra_info()

File ~/anaconda3/lib/python3.10/site-packages/threadpoolctl.py:646, in _OpenBLASModule.get_version(self)
    643 get_config = getattr(self._dynlib, "openblas_get_config",
    644                      lambda: None)
    645 get_config.restype = ctypes.c_char_p
--> 646 config = get_config().split()
    647 if config[0] == b"OpenBLAS":
    648     return config[1].decode("utf-8")

AttributeError: 'NoneType' object has no attribute 'split'

sachaMorin commented 10 months ago

Can you run the following in your python env please and report the output?

import sklearn
import threadpoolctl
print(sklearn.__version__)
print(threadpoolctl.__version__)

mohrezazali commented 10 months ago

1.3.0
2.2.0

sachaMorin commented 10 months ago

Okay I can reproduce. The threadpoolctl version seems to be the problem. Can you try upgrading it?

pip install --upgrade threadpoolctl

mohrezazali commented 10 months ago

Solved! Great! Thanks for your help!

sachaMorin commented 10 months ago

Awesome! Leaving this link here for future reference. Also the sklearn issue.