rasbt / mlxtend

A library of extension and helper modules for Python's data analysis and machine learning libraries.
https://rasbt.github.io/mlxtend/

Additional Regressors Fails #291

Closed BrianMiner closed 3 years ago

BrianMiner commented 6 years ago

I am trying to modify the example code by adding additional regressors, in this case ExtraTrees (the same issue occurs with xgboost and random forest). Am I doing something wrong?

import numpy as np
from mlxtend.regressor import StackingCVRegressor
from sklearn.datasets import load_boston
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.model_selection import GridSearchCV

RANDOM_SEED = 53222

X, y = load_boston(return_X_y=True)

ridge = Ridge()
lasso = Lasso()
rf = RandomForestRegressor(random_state=RANDOM_SEED)

et = ExtraTreesRegressor(random_state=RANDOM_SEED)

# The StackingCVRegressor uses scikit-learn's check_cv
# internally, which doesn't support a random seed. Thus
# NumPy's random seed needs to be set explicitly for
# deterministic behavior.
np.random.seed(RANDOM_SEED)
stack = StackingCVRegressor(regressors=(lasso, ridge, et),
                            meta_regressor=rf, 
                            use_features_in_secondary=True)

grid = GridSearchCV(
    estimator=stack, 
    param_grid={
        'lasso__alpha': [x/5.0 for x in range(1, 10)],
        'et__n_estimators': [10,20],
        'ridge__alpha': [x/20.0 for x in range(1, 10)],
        'meta-randomforestregressor__n_estimators': [10, 100]
    }, 
    cv=5,
    refit=True
)

grid.fit(X, y)

print("Best: %f using %s" % (grid.best_score_, grid.best_params_))

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-24-a8491f6a64e4> in <module>()
     40 )
     41 
---> 42 grid.fit(X, y)
     43 
     44 print("Best: %f using %s" % (grid.best_score_, grid.best_params_))

~/anaconda3/lib/python3.6/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
    637                                   error_score=self.error_score)
    638           for parameters, (train, test) in product(candidate_params,
--> 639                                                    cv.split(X, y, groups)))
    640 
    641         # if one choose to see train score, "out" will contain train score info

~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
    777             # was dispatched. In particular this covers the edge
    778             # case of Parallel used with an exhausted iterator.
--> 779             while self.dispatch_one_batch(iterator):
    780                 self._iterating = True
    781             else:

~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
    623                 return False
    624             else:
--> 625                 self._dispatch(tasks)
    626                 return True
    627 

~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
    586         dispatch_timestamp = time.time()
    587         cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 588         job = self._backend.apply_async(batch, callback=cb)
    589         self._jobs.append(job)
    590 

~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
    109     def apply_async(self, func, callback=None):
    110         """Schedule a func to be run"""
--> 111         result = ImmediateResult(func)
    112         if callback:
    113             callback(result)

~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
    330         # Don't delay the application, to avoid keeping the input
    331         # arguments in memory
--> 332         self.results = batch()
    333 
    334     def get(self):

~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
    132 
    133     def __len__(self):

~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
    132 
    133     def __len__(self):

~/anaconda3/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)
    442     train_scores = {}
    443     if parameters is not None:
--> 444         estimator.set_params(**parameters)
    445 
    446     start_time = time.time()

~/anaconda3/lib/python3.6/site-packages/sklearn/base.py in set_params(self, **params)
    272                                  'Check the list of available parameters '
    273                                  'with `estimator.get_params().keys()`.' %
--> 274                                  (key, self))
    275 
    276             if delim:

ValueError: Invalid parameter et for estimator StackingCVRegressor(cv=5,
          meta_regressor=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=53222, verbose=0,
           warm_start=False),
          regressors=(Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False), Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_stat...stimators=10, n_jobs=1,
          oob_score=False, random_state=53222, verbose=0, warm_start=False)),
          shuffle=True, use_features_in_secondary=True). Check the list of available parameters with `estimator.get_params().keys()`.
rasbt commented 6 years ago

Hi there,

I just tried your code and got it to work. There was a typo in the param_grid: instead of 'et', you need to use the name of the class in all lowercase, e.g.,

    param_grid={
        'lasso__alpha': [x/5.0 for x in range(1, 10)],
        'extratreesregressor__n_estimators': [10,20],
        'ridge__alpha': [x/20.0 for x in range(1, 10)],
        'meta-randomforestregressor__n_estimators': [10, 100]
    },  

If multiple instances of the same class are used, e.g., two ExtraTreesRegressor objects, you would use

'extratreesregressor-1__n_estimators', 'extratreesregressor-2__n_estimators', etc.
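
A quick way to see every valid key (the same list the error message points to) is to print the stack's parameter names via get_params(); a minimal sketch:

from mlxtend.regressor import StackingCVRegressor
from sklearn.linear_model import Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

stack = StackingCVRegressor(regressors=(Lasso(), Ridge(), ExtraTreesRegressor()),
                            meta_regressor=RandomForestRegressor())

# Every key used in param_grid must appear in this list; note the
# class-based names such as 'extratreesregressor__n_estimators'
# rather than 'et__n_estimators'.
for name in sorted(stack.get_params().keys()):
    print(name)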

Hope that helps!

BrianMiner commented 6 years ago

Oh, that is extremely helpful. The documentation led me to believe that one used the object names as assigned (e.g., 'et' above). Further, the -1, -2 labeling makes sense; I was not sure how to differentiate instances of the same class. Can we assume that -1, -2, etc. correspond to the ordering of the same-class objects in the regressors argument, so that if we have

etA = ExtraTreesRegressor(random_state=RANDOM_SEED)
etB = ExtraTreesRegressor(random_state=RANDOM_SEED)

stack = StackingCVRegressor(regressors=(lasso, etA, ridge, etB),
                            meta_regressor=rf,
                            use_features_in_secondary=True)

then

param_grid = {
    'lasso__alpha': [x/5.0 for x in range(1, 10)],
    'extratreesregressor-1__n_estimators': [10, 20],
    'extratreesregressor-2__n_estimators': [100, 1500],
    'ridge__alpha': [x/20.0 for x in range(1, 10)],
    'meta-randomforestregressor__n_estimators': [10, 100]
}

extratreesregressor-1 then references etA and extratreesregressor-2 references etB?

rasbt commented 6 years ago

Can we assume that -1, -2 etc will correspond to the ordering of the same class objects in the regressors argument

Yes, that's correct! Hm, I remember that I documented this somewhere ... Although this issue seems to be "resolved" :), let's leave it open so that I don't forget to double-check where I documented this and to add a note to the StackingCVRegressor docs.
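
As a quick check of that ordering, one can list the parameter keys of a stack built with two ExtraTreesRegressor instances (a minimal sketch; the -1/-2 suffixes come from the duplicated class, per the convention described above):

from mlxtend.regressor import StackingCVRegressor
from sklearn.linear_model import Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

etA = ExtraTreesRegressor(random_state=1)
etB = ExtraTreesRegressor(random_state=2)

stack = StackingCVRegressor(regressors=(Lasso(), etA, Ridge(), etB),
                            meta_regressor=RandomForestRegressor())

# Duplicated classes are suffixed in the order they appear in `regressors`,
# so 'extratreesregressor-1' maps to etA and 'extratreesregressor-2' to etB.
print(sorted(k for k in stack.get_params() if k.startswith('extratreesregressor')))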