csinva / imodels

Interpretable ML package 🔍 for concise, transparent, and accurate predictive modeling (sklearn-compatible).
https://csinva.io/imodels
MIT License
1.4k stars 124 forks source link

BoostedRulesClassifier, BoostedRulesRegressor not working with sklearn cross_validate #176

Closed jckkvs closed 1 year ago

jckkvs commented 1 year ago

The following code snippet results in an error.

from imodels import BoostedRulesRegressor, BoostedRulesClassifier
from sklearn.datasets import make_regression , make_classification
from sklearn.model_selection import cross_validate
X,y = make_regression()
cross_validate(BoostedRulesRegressor(),X,y)
X,y = make_classification()
cross_validate(BoostedRulesClassifier(),X,y)

The error

---------------------------------------------------------------------------
Empty                                     Traceback (most recent call last)
File ~\Anaconda3\envs\py310\lib\site-packages\joblib\parallel.py:862, in Parallel.dispatch_one_batch(self, iterator)
    861 try:
--> 862     tasks = self._ready_batches.get(block=False)
    863 except queue.Empty:
    864     # slice the iterator n_jobs * batchsize items at a time. If the
    865     # slice returns less than that, then the current batchsize puts
   (...)
    868     # accordingly to distribute evenly the last items between all
    869     # workers.

File ~\Anaconda3\envs\py310\lib\queue.py:168, in Queue.get(self, block, timeout)
    167     if not self._qsize():
--> 168         raise Empty
    169 elif timeout is None:

Empty: 

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
File ~\Anaconda3\envs\py310\lib\site-packages\imodels\rule_set\boosted_rules.py:80, in BoostedRulesRegressor.__init__(self, estimator, n_estimators, learning_rate, random_state)
     78 try: # sklearn version >= 1.2
     79     super().__init__(
---> 80         estimator=estimator(),
     81         n_estimators=n_estimators,
     82         learning_rate=learning_rate,
     83         random_state=random_state,
     84     )
     85 except: # sklearn version < 1.2

TypeError: 'DecisionTreeRegressor' object is not callable

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
Input In [3], in <module>
      1 X,y = make_regression()
----> 2 cross_validate(BoostedRulesRegressor(),X,y)

File ~\Anaconda3\envs\py310\lib\site-packages\sklearn\model_selection\_validation.py:266, in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score, return_estimator, error_score)
    263 # We clone the estimator to make sure that all the folds are
    264 # independent, and that it is pickle-able.
    265 parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)
--> 266 results = parallel(
    267     delayed(_fit_and_score)(
    268         clone(estimator),
    269         X,
    270         y,
    271         scorers,
    272         train,
    273         test,
    274         verbose,
    275         None,
    276         fit_params,
    277         return_train_score=return_train_score,
    278         return_times=True,
    279         return_estimator=return_estimator,
    280         error_score=error_score,
    281     )
    282     for train, test in cv.split(X, y, groups)
    283 )
    285 _warn_or_raise_about_fit_failures(results, error_score)
    287 # For callabe scoring, the return type is only know after calling. If the
    288 # return type is a dictionary, the error scores can now be inserted with
    289 # the correct key.

File ~\Anaconda3\envs\py310\lib\site-packages\sklearn\utils\parallel.py:63, in Parallel.__call__(self, iterable)
     58 config = get_config()
     59 iterable_with_config = (
     60     (_with_config(delayed_func, config), args, kwargs)
     61     for delayed_func, args, kwargs in iterable
     62 )
---> 63 return super().__call__(iterable_with_config)

File ~\Anaconda3\envs\py310\lib\site-packages\joblib\parallel.py:1085, in Parallel.__call__(self, iterable)
   1076 try:
   1077     # Only set self._iterating to True if at least a batch
   1078     # was dispatched. In particular this covers the edge
   (...)
   1082     # was very quick and its callback already dispatched all the
   1083     # remaining jobs.
   1084     self._iterating = False
-> 1085     if self.dispatch_one_batch(iterator):
   1086         self._iterating = self._original_iterator is not None
   1088     while self.dispatch_one_batch(iterator):

File ~\Anaconda3\envs\py310\lib\site-packages\joblib\parallel.py:873, in Parallel.dispatch_one_batch(self, iterator)
    870 n_jobs = self._cached_effective_n_jobs
    871 big_batch_size = batch_size * n_jobs
--> 873 islice = list(itertools.islice(iterator, big_batch_size))
    874 if len(islice) == 0:
    875     return False

File ~\Anaconda3\envs\py310\lib\site-packages\sklearn\utils\parallel.py:59, in <genexpr>(.0)
     54 # Capture the thread-local scikit-learn configuration at the time
     55 # Parallel.__call__ is issued since the tasks can be dispatched
     56 # in a different thread depending on the backend and on the value of
     57 # pre_dispatch and n_jobs.
     58 config = get_config()
---> 59 iterable_with_config = (
     60     (_with_config(delayed_func, config), args, kwargs)
     61     for delayed_func, args, kwargs in iterable
     62 )
     63 return super().__call__(iterable_with_config)

File ~\Anaconda3\envs\py310\lib\site-packages\sklearn\model_selection\_validation.py:268, in <genexpr>(.0)
    263 # We clone the estimator to make sure that all the folds are
    264 # independent, and that it is pickle-able.
    265 parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)
    266 results = parallel(
    267     delayed(_fit_and_score)(
--> 268         clone(estimator),
    269         X,
    270         y,
    271         scorers,
    272         train,
    273         test,
    274         verbose,
    275         None,
    276         fit_params,
    277         return_train_score=return_train_score,
    278         return_times=True,
    279         return_estimator=return_estimator,
    280         error_score=error_score,
    281     )
    282     for train, test in cv.split(X, y, groups)
    283 )
    285 _warn_or_raise_about_fit_failures(results, error_score)
    287 # For callabe scoring, the return type is only know after calling. If the
    288 # return type is a dictionary, the error scores can now be inserted with
    289 # the correct key.

File ~\Anaconda3\envs\py310\lib\site-packages\sklearn\base.py:90, in clone(estimator, safe)
     88 for name, param in new_object_params.items():
     89     new_object_params[name] = clone(param, safe=False)
---> 90 new_object = klass(**new_object_params)
     91 params_set = new_object.get_params(deep=False)
     93 # quick sanity check of the parameters of the clone

File ~\Anaconda3\envs\py310\lib\site-packages\imodels\rule_set\boosted_rules.py:87, in BoostedRulesRegressor.__init__(self, estimator, n_estimators, learning_rate, random_state)
     79     super().__init__(
     80         estimator=estimator(),
     81         n_estimators=n_estimators,
     82         learning_rate=learning_rate,
     83         random_state=random_state,
     84     )
     85 except: # sklearn version < 1.2
     86     super().__init__(
---> 87         base_estimator=estimator(),
     88         n_estimators=n_estimators,
     89         learning_rate=learning_rate,
     90         random_state=random_state,
     91     )
     92     self.estimator = estimator

TypeError: 'DecisionTreeRegressor' object is not callable
csinva commented 1 year ago

Will look closer into what this error is, but you may try just upgrading your scikit learn installation? pip install --upgrade scikit-learn

jckkvs commented 1 year ago

> Will look closer into what this error is, but you may try just upgrading your scikit learn installation? pip install --upgrade scikit-learn

Thank you for the reply and for the great library.

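This error does not depend on the sklearn version; it stems from how sklearn.base.clone works, as described in the scikit-learn developer guide: https://scikit-learn.org/stable/developers/develop.html. clone re-creates the estimator by calling its constructor with the estimator's existing parameters (see `klass(**new_object_params)` in the traceback above), so `__init__` receives a DecisionTreeRegressor instance rather than a class, and the `estimator()` call inside `__init__` fails.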

For example, the following code also raises the same error:

clone(BoostedRulesRegressor())
csinva commented 1 year ago

Gotcha, okay I will merge in the PR you opened.

Thanks for fixing this issue!