BoostedRulesClassifier, BoostedRulesRegressor not working with sklearn cross_validate

jckkvs commented 1 year ago

The following code snippet results in an error.

from imodels import BoostedRulesRegressor, BoostedRulesClassifier
from sklearn.datasets import make_regression , make_classification
from sklearn.model_selection import cross_validate
X,y = make_regression()
cross_validate(BoostedRulesRegressor(),X,y)

X,y = make_classification()
cross_validate(BoostedRulesClassifier(),X,y)

The error

---------------------------------------------------------------------------
Empty                                     Traceback (most recent call last)
File ~\Anaconda3\envs\py310\lib\site-packages\joblib\parallel.py:862, in Parallel.dispatch_one_batch(self, iterator)
    861 try:
--> 862     tasks = self._ready_batches.get(block=False)
    863 except queue.Empty:
    864     # slice the iterator n_jobs * batchsize items at a time. If the
    865     # slice returns less than that, then the current batchsize puts
   (...)
    868     # accordingly to distribute evenly the last items between all
    869     # workers.

File ~\Anaconda3\envs\py310\lib\queue.py:168, in Queue.get(self, block, timeout)
    167     if not self._qsize():
--> 168         raise Empty
    169 elif timeout is None:

Empty: 

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
File ~\Anaconda3\envs\py310\lib\site-packages\imodels\rule_set\boosted_rules.py:80, in BoostedRulesRegressor.__init__(self, estimator, n_estimators, learning_rate, random_state)
     78 try: # sklearn version >= 1.2
     79     super().__init__(
---> 80         estimator=estimator(),
     81         n_estimators=n_estimators,
     82         learning_rate=learning_rate,
     83         random_state=random_state,
     84     )
     85 except: # sklearn version < 1.2

TypeError: 'DecisionTreeRegressor' object is not callable

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
Input In [3], in <module>
      1 X,y = make_regression()
----> 2 cross_validate(BoostedRulesRegressor(),X,y)

File ~\Anaconda3\envs\py310\lib\site-packages\sklearn\model_selection\_validation.py:266, in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score, return_estimator, error_score)
    263 # We clone the estimator to make sure that all the folds are
    264 # independent, and that it is pickle-able.
    265 parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)
--> 266 results = parallel(
    267     delayed(_fit_and_score)(
    268         clone(estimator),
    269         X,
    270         y,
    271         scorers,
    272         train,
    273         test,
    274         verbose,
    275         None,
    276         fit_params,
    277         return_train_score=return_train_score,
    278         return_times=True,
    279         return_estimator=return_estimator,
    280         error_score=error_score,
    281     )
    282     for train, test in cv.split(X, y, groups)
    283 )
    285 _warn_or_raise_about_fit_failures(results, error_score)
    287 # For callabe scoring, the return type is only know after calling. If the
    288 # return type is a dictionary, the error scores can now be inserted with
    289 # the correct key.

File ~\Anaconda3\envs\py310\lib\site-packages\sklearn\utils\parallel.py:63, in Parallel.__call__(self, iterable)
     58 config = get_config()
     59 iterable_with_config = (
     60     (_with_config(delayed_func, config), args, kwargs)
     61     for delayed_func, args, kwargs in iterable
     62 )
---> 63 return super().__call__(iterable_with_config)

File ~\Anaconda3\envs\py310\lib\site-packages\joblib\parallel.py:1085, in Parallel.__call__(self, iterable)
   1076 try:
   1077     # Only set self._iterating to True if at least a batch
   1078     # was dispatched. In particular this covers the edge
   (...)
   1082     # was very quick and its callback already dispatched all the
   1083     # remaining jobs.
   1084     self._iterating = False
-> 1085     if self.dispatch_one_batch(iterator):
   1086         self._iterating = self._original_iterator is not None
   1088     while self.dispatch_one_batch(iterator):

File ~\Anaconda3\envs\py310\lib\site-packages\joblib\parallel.py:873, in Parallel.dispatch_one_batch(self, iterator)
    870 n_jobs = self._cached_effective_n_jobs
    871 big_batch_size = batch_size * n_jobs
--> 873 islice = list(itertools.islice(iterator, big_batch_size))
    874 if len(islice) == 0:
    875     return False

File ~\Anaconda3\envs\py310\lib\site-packages\sklearn\utils\parallel.py:59, in <genexpr>(.0)
     54 # Capture the thread-local scikit-learn configuration at the time
     55 # Parallel.__call__ is issued since the tasks can be dispatched
     56 # in a different thread depending on the backend and on the value of
     57 # pre_dispatch and n_jobs.
     58 config = get_config()
---> 59 iterable_with_config = (
     60     (_with_config(delayed_func, config), args, kwargs)
     61     for delayed_func, args, kwargs in iterable
     62 )
     63 return super().__call__(iterable_with_config)

File ~\Anaconda3\envs\py310\lib\site-packages\sklearn\model_selection\_validation.py:268, in <genexpr>(.0)
    263 # We clone the estimator to make sure that all the folds are
    264 # independent, and that it is pickle-able.
    265 parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)
    266 results = parallel(
    267     delayed(_fit_and_score)(
--> 268         clone(estimator),
    269         X,
    270         y,
    271         scorers,
    272         train,
    273         test,
    274         verbose,
    275         None,
    276         fit_params,
    277         return_train_score=return_train_score,
    278         return_times=True,
    279         return_estimator=return_estimator,
    280         error_score=error_score,
    281     )
    282     for train, test in cv.split(X, y, groups)
    283 )
    285 _warn_or_raise_about_fit_failures(results, error_score)
    287 # For callabe scoring, the return type is only know after calling. If the
    288 # return type is a dictionary, the error scores can now be inserted with
    289 # the correct key.

File ~\Anaconda3\envs\py310\lib\site-packages\sklearn\base.py:90, in clone(estimator, safe)
     88 for name, param in new_object_params.items():
     89     new_object_params[name] = clone(param, safe=False)
---> 90 new_object = klass(**new_object_params)
     91 params_set = new_object.get_params(deep=False)
     93 # quick sanity check of the parameters of the clone

File ~\Anaconda3\envs\py310\lib\site-packages\imodels\rule_set\boosted_rules.py:87, in BoostedRulesRegressor.__init__(self, estimator, n_estimators, learning_rate, random_state)
     79     super().__init__(
     80         estimator=estimator(),
     81         n_estimators=n_estimators,
     82         learning_rate=learning_rate,
     83         random_state=random_state,
     84     )
     85 except: # sklearn version < 1.2
     86     super().__init__(
---> 87         base_estimator=estimator(),
     88         n_estimators=n_estimators,
     89         learning_rate=learning_rate,
     90         random_state=random_state,
     91     )
     92     self.estimator = estimator

TypeError: 'DecisionTreeRegressor' object is not callable

csinva commented 1 year ago

Will look closer into what this error is, but you may try just upgrading your scikit learn installation? pip install --upgrade scikit-learn

jckkvs commented 1 year ago

> Will look closer into what this error is, but you may try just upgrading your scikit learn installation? pip install --upgrade scikit-learn

Thank you for the reply and for the great library.

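This error does not depend on the sklearn version. It is caused by how sklearn.base.clone works: clone re-creates the estimator by passing its current parameters back into __init__, so __init__ must store its arguments unchanged. This is stated in the scikit-learn developer guide: https://scikit-learn.org/stable/developers/develop.html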

> "There should be no logic, not even input validation, and the parameters should not be changed. The corresponding logic should be put where the parameters are used, typically in fit. The following is wrong: [...]"

For example, the following code also raises the same error:

clone(BoostedRulesRegressor())

csinva commented 1 year ago

Gotcha, okay I will merge in the PR you opened.

Thanks for fixing this issue!

csinva / imodels

BoostedRulesClassifier, BoostedRulesRegressor not working with sklearn cross_validate #176