Labo-Lacourse / stepmix

A Python package following the scikit-learn API for model-based clustering and generalized mixture modeling (latent class/profile analysis) of continuous and categorical data. StepMix handles missing values through Full Information Maximum Likelihood (FIML) and provides multiple stepwise Expectation-Maximization (EM) estimation methods.
https://stepmix.readthedocs.io/en/latest/index.html
MIT License
54 stars 4 forks source link

init_params = 'kmeans' #54

Closed mohrezazali closed 7 months ago

mohrezazali commented 7 months ago

Hello. When I set init_params to 'kmeans', I run into an error (detailed traceback below). The same model with the same data worked properly until yesterday, but today it doesn't work.

sachaMorin commented 7 months ago

I just tested init_params='kmeans' locally and it seems to work normally. Can you share the error message? And ideally some code reproducing the error?

Thanks,

sachaMorin commented 7 months ago

This is what I tested specifically.

from stepmix.stepmix import  StepMix
from stepmix.datasets import data_bakk_response
from sklearn.metrics import rand_score

X, Y, gt = data_bakk_response(1000, sep_level=.9, random_state=42)

model = StepMix(n_components=3, n_steps=1, measurement='binary', structural='gaussian_unit', random_state=42, init_params='kmeans')
model.fit(X, Y)
preds = model.predict(X, Y)
print(rand_score(gt, preds))
mohrezazali commented 7 months ago
from stepmix.utils import get_mixed_descriptor

X = df[['elect', 'treated', 'duration']]
y = df['hazard']

mixed_data, mixed_descriptor = get_mixed_descriptor(
    dataframe=df,
    continuous=['treated', 'duration'],
    binary=['elect'],
)

effect_model = StepMix(n_components=3, measurement=mixed_descriptor, n_init=3, init_params='kmeans', n_steps=3, 
                structural='binary', verbose=1, random_state=123)

effect_model.fit(mixed_data, y)
sachaMorin commented 7 months ago

Thank you! And can you also share the error? (copy pasting the error message would be fine)

mohrezazali commented 7 months ago

AttributeError Traceback (most recent call last) Cell In[211], line 17 6 mixed_data, mixed_descriptor = get_mixed_descriptor( 7 dataframe=df, 8 continuous=['treated', 'duration'], 9 binary=['elect'], 10 # categorical=['sepal width (q=3)'] 11 ) 14 effect_model = StepMixClassifier(n_components=3, measurement=mixed_descriptor,n_init=3, correction= 'BCH', init_params='kmeans', n_steps=3, 15 structural='binary', 16 verbose=1, random_state=123) ---> 17 effect_model.fit(mixed_data, y)

File ~/anaconda3/lib/python3.10/site-packages/stepmix/stepmix.py:771, in StepMix.fit(self, X, Y, sample_weight, y) 766 self.m_step_structural(resp, Y) 768 elif self.n_steps == 3 and self.correction == "BCH": 769 # Three-step estimation with BCH correction 770 # 1) Fit the measurement model --> 771 self.em(X) 773 # 2) Assign class probabilities 774 soft_resp = self.predict_proba_class(X)

File ~/anaconda3/lib/python3.10/site-packages/stepmix/stepmix.py:886, in StepMix.em(self, X, Y, sample_weight, freeze_measurement, log_emission_pm) 884 for init in tqdm_init: 885 if not freeze_measurement: --> 886 self._initialize_parameters(X, random_state) # Measurement model 888 if Y is not None: 889 self._initialize_parameters_structural( 890 Y, random_state 891 ) # Structural Model

File ~/anaconda3/lib/python3.10/site-packages/stepmix/stepmix.py:361, in StepMix._initialize_parameters(self, X, random_state) 355 if self.init_params == "kmeans": 356 resp = np.zeros((n_samples, self.n_components)) 357 label = ( 358 KMeans( 359 n_clusters=self.n_components, n_init=1, random_state=random_state 360 ) --> 361 .fit(X) 362 .labels_ 363 ) 364 resp[np.arange(n_samples), label] = 1 365 elif self.init_params == "random":

File ~/anaconda3/lib/python3.10/site-packages/sklearn/base.py:1151, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs) 1144 estimator._validate_params() 1146 with config_context( 1147 skip_parameter_validation=( 1148 prefer_skip_nested_validation or global_skip_validation 1149 ) 1150 ): -> 1151 return fit_method(estimator, *args, **kwargs)

File ~/anaconda3/lib/python3.10/site-packages/sklearn/cluster/_kmeans.py:1526, in KMeans.fit(self, X, y, sample_weight) 1523 print("Initialization complete") 1525 # run a k-means once -> 1526 labels, inertia, centers, n_iter_ = kmeans_single( 1527 X, 1528 sample_weight, 1529 centers_init, 1530 max_iter=self.max_iter, 1531 verbose=self.verbose, 1532 tol=self._tol, 1533 n_threads=self._n_threads, 1534 ) 1536 # determine if these results are the best so far 1537 # we chose a new run if it has a better inertia and the clustering is 1538 # different from the best so far (it's possible that the inertia is 1539 # slightly better even if the clustering is the same with potentially 1540 # permuted labels, due to rounding errors) 1541 if best_inertia is None or ( 1542 inertia < best_inertia 1543 and not _is_same_clustering(labels, best_labels, self.n_clusters) 1544 ):

File ~/anaconda3/lib/python3.10/site-packages/sklearn/cluster/_kmeans.py:688, in _kmeans_single_lloyd(X, sample_weight, centers_init, max_iter, verbose, tol, n_threads) 684 strict_convergence = False 686 # Threadpoolctl context to limit the number of threads in second level of 687 # nested parallelism (i.e. BLAS) to avoid oversubscription. --> 688 with threadpool_limits(limits=1, user_api="blas"): 689 for i in range(max_iter): 690 lloyd_iter( 691 X, 692 sample_weight, (...) 698 n_threads, 699 )

File ~/anaconda3/lib/python3.10/site-packages/sklearn/utils/fixes.py:72, in threadpool_limits(limits, user_api) 70 return controller.limit(limits=limits, user_api=user_api) 71 else: ---> 72 return threadpoolctl.threadpool_limits(limits=limits, user_api=user_api)

File ~/anaconda3/lib/python3.10/site-packages/threadpoolctl.py:171, in threadpool_limits.__init__(self, limits, user_api) 167 def __init__(self, limits=None, user_api=None): 168 self._limits, self._user_api, self._prefixes = \ 169 self._check_params(limits, user_api) --> 171 self._original_info = self._set_threadpool_limits()

File ~/anaconda3/lib/python3.10/site-packages/threadpoolctl.py:268, in threadpool_limits._set_threadpool_limits(self) 265 if self._limits is None: 266 return None --> 268 modules = _ThreadpoolInfo(prefixes=self._prefixes, 269 user_api=self._user_api) 270 for module in modules: 271 # self._limits is a dict {key: num_threads} where key is either 272 # a prefix or a user_api. If a module matches both, the limit 273 # corresponding to the prefix is chosed. 274 if module.prefix in self._limits:

File ~/anaconda3/lib/python3.10/site-packages/threadpoolctl.py:340, in _ThreadpoolInfo.__init__(self, user_api, prefixes, modules) 337 self.user_api = [] if user_api is None else user_api 339 self.modules = [] --> 340 self._load_modules() 341 self._warn_if_incompatible_openmp() 342 else:

File ~/anaconda3/lib/python3.10/site-packages/threadpoolctl.py:371, in _ThreadpoolInfo._load_modules(self) 369 """Loop through loaded libraries and store supported ones""" 370 if sys.platform == "darwin": --> 371 self._find_modules_with_dyld() 372 elif sys.platform == "win32": 373 self._find_modules_with_enum_process_module_ex()

File ~/anaconda3/lib/python3.10/site-packages/threadpoolctl.py:428, in _ThreadpoolInfo._find_modules_with_dyld(self) 425 filepath = filepath.decode("utf-8") 427 # Store the module if it is supported and selected --> 428 self._make_module_from_path(filepath)

File ~/anaconda3/lib/python3.10/site-packages/threadpoolctl.py:515, in _ThreadpoolInfo._make_module_from_path(self, filepath) 513 if prefix in self.prefixes or user_api in self.user_api: 514 module_class = globals()[module_class] --> 515 module = module_class(filepath, prefix, user_api, internal_api) 516 self.modules.append(module)

File ~/anaconda3/lib/python3.10/site-packages/threadpoolctl.py:606, in _Module.__init__(self, filepath, prefix, user_api, internal_api) 604 self.internal_api = internal_api 605 self._dynlib = ctypes.CDLL(filepath, mode=_RTLD_NOLOAD) --> 606 self.version = self.get_version() 607 self.num_threads = self.get_num_threads() 608 self._get_extra_info()

File ~/anaconda3/lib/python3.10/site-packages/threadpoolctl.py:646, in _OpenBLASModule.get_version(self) 643 get_config = getattr(self._dynlib, "openblas_get_config", 644 lambda: None) 645 get_config.restype = ctypes.c_char_p --> 646 config = get_config().split() 647 if config[0] == b"OpenBLAS": 648 return config[1].decode("utf-8")

AttributeError: 'NoneType' object has no attribute 'split'

sachaMorin commented 7 months ago

Can you run the following in your python env please and report the output?

import sklearn
import threadpoolctl
print(sklearn.__version__)
print(threadpoolctl.__version__)
mohrezazali commented 7 months ago

1.3.0 2.2.0

sachaMorin commented 7 months ago

Okay I can reproduce. The threadpoolctl version seems to be the problem. Can you try upgrading it?

pip install --upgrade threadpoolctl
mohrezazali commented 7 months ago

Solved! Great! Thanks for your help!

sachaMorin commented 7 months ago

Awesome! Leaving this link here for future reference. Also the sklearn issue.