bmurauer / pipelinehelper

scikit-helper to hot-swap pipeline elements
GNU General Public License v3.0
21 stars 9 forks source link

Your model does not support decision_function #7

Closed browshanravan closed 4 years ago

browshanravan commented 4 years ago

It's me again :)

The package works great; however, when I try to use GaussianNB(), ExtraTreesClassifier() or RandomForestClassifier() within PipelineHelper(), I get the following error.

It works fine for other classifiers I have tried!


The above exception was the direct cause of the following exception:

ValueError                                Traceback (most recent call last)
<ipython-input-67-63d988a61196> in <module>
     41                           scoring="roc_auc", cv=skf)
     42 
---> 43 grid.fit(X_train, y_train)

~/PycharmProjects/Data School/DS_Pandas_tut/venv/lib/python3.7/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
     71                           FutureWarning)
     72         kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 73         return f(**kwargs)
     74     return inner_f
     75 

~/PycharmProjects/Data School/DS_Pandas_tut/venv/lib/python3.7/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
    734                 return results
    735 
--> 736             self._run_search(evaluate_candidates)
    737 
    738         # For multi-metric evaluation, store the best_index_, best_params_ and

~/PycharmProjects/Data School/DS_Pandas_tut/venv/lib/python3.7/site-packages/sklearn/model_selection/_search.py in _run_search(self, evaluate_candidates)
   1186     def _run_search(self, evaluate_candidates):
   1187         """Search all candidates in param_grid"""
-> 1188         evaluate_candidates(ParameterGrid(self.param_grid))
   1189 
   1190 

~/PycharmProjects/Data School/DS_Pandas_tut/venv/lib/python3.7/site-packages/sklearn/model_selection/_search.py in evaluate_candidates(candidate_params)
    713                                for parameters, (train, test)
    714                                in product(candidate_params,
--> 715                                           cv.split(X, y, groups)))
    716 
    717                 if len(out) < 1:

~/PycharmProjects/Data School/DS_Pandas_tut/venv/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable)
   1040 
   1041             with self._backend.retrieval_context():
-> 1042                 self.retrieve()
   1043             # Make sure that we get a last message telling us we are done
   1044             elapsed_time = time.time() - self._start_time

~/PycharmProjects/Data School/DS_Pandas_tut/venv/lib/python3.7/site-packages/joblib/parallel.py in retrieve(self)
    919             try:
    920                 if getattr(self._backend, 'supports_timeout', False):
--> 921                     self._output.extend(job.get(timeout=self.timeout))
    922                 else:
    923                     self._output.extend(job.get())

~/PycharmProjects/Data School/DS_Pandas_tut/venv/lib/python3.7/site-packages/joblib/_parallel_backends.py in wrap_future_result(future, timeout)
    538         AsyncResults.get from multiprocessing."""
    539         try:
--> 540             return future.result(timeout=timeout)
    541         except CfTimeoutError:
    542             raise TimeoutError()

/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/concurrent/futures/_base.py in result(self, timeout)
    426                 raise CancelledError()
    427             elif self._state == FINISHED:
--> 428                 return self.__get_result()
    429 
    430             self._condition.wait(timeout)

/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/concurrent/futures/_base.py in __get_result(self)
    382     def __get_result(self):
    383         if self._exception:
--> 384             raise self._exception
    385         else:
    386             return self._result

ValueError: Your model (GaussianNB()) does not support decision_function
bmurauer commented 4 years ago

Thank you for your report. Can you post the full code that causes this error?

browshanravan commented 4 years ago
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from pipelinehelper import PipelineHelper
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV

# Reproduction script for the "does not support decision_function" error:
# it builds a preprocessing + PipelineHelper pipeline and grid-searches it
# with ROC-AUC scoring.

# Read only the listed columns, loading Sex and Embarked as pandas categoricals.
dtypes = {'Sex':'category', 'Embarked':'category'}
desired_columns = ["Survived", "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
titanic = pd.read_csv("http://bit.ly/kaggletrain", usecols= desired_columns, dtype= dtypes)
# Shared random_state for the two tree ensembles below.
SEED_VALUE=42
# Features are every loaded column except the target; the target is "Survived".
X = titanic.drop("Survived", axis=1)
y = titanic["Survived"]

# Numeric branch: mean-impute missing values in "Age".
fill_imputer_num = ["Age"]
fill_imputer_num_pipeline = Pipeline(steps=
    [
    ("num_imputer", SimpleImputer(missing_values=np.nan, strategy="mean")),
    ]
)

# Categorical branch: impute with the most frequent value, then one-hot encode.
# NOTE(review): fill_value is documented for strategy="constant"; it presumably
# has no effect with strategy='most_frequent' -- confirm against SimpleImputer docs.
fill_impute_cat = ['Sex', 'Embarked']
fill_imputer_cat_pipeline = Pipeline(steps=
    [
    ("cat_imputer", SimpleImputer(missing_values=np.nan, strategy='most_frequent', fill_value='missing')),
    ("onehot", OneHotEncoder())
    ]
)

# Route "Age" through the numeric branch and Sex/Embarked through the
# categorical branch.
# NOTE(review): Pclass, SibSp, Parch and Fare are not listed in any transformer;
# presumably they are dropped by ColumnTransformer's default remainder handling
# -- confirm this is intended.
preprosessor = ColumnTransformer(transformers=
    [
    ("N_Fimp", fill_imputer_num_pipeline, fill_imputer_num),
    ("C_Fimp", fill_imputer_cat_pipeline, fill_impute_cat),
    ]
)

# Full pipeline: preprocessing followed by a PipelineHelper step that holds
# three candidate classifiers under the names used as prefixes in param_grid.
pipe = Pipeline(steps=
    [
        ("preprosessor", preprosessor),
        ('clf', PipelineHelper([
            ('ExtraTreesClassifier', ExtraTreesClassifier(n_jobs=-1, random_state=SEED_VALUE)),
            ('RandomForestClassifier', RandomForestClassifier(n_jobs=-1, random_state=SEED_VALUE)),
            ("GaussianNB", GaussianNB())
    ])),
    ]
)

# The search space is produced by PipelineHelper.generate() from
# "<model name>__<parameter>" keys. Only the two tree ensembles have
# parameters listed here; no GaussianNB parameters are given.
param_grid = {
    'clf__selected_model': pipe.named_steps['clf'].generate({
        "ExtraTreesClassifier__n_estimators": [16, 32, 100, 300, 500],
        "ExtraTreesClassifier__criterion": ["gini", "entropy"],
        "ExtraTreesClassifier__warm_start": [True, False],

        "RandomForestClassifier__n_estimators": [16, 32, 100, 300, 500],
        "RandomForestClassifier__criterion": ["gini", "entropy"],
        "RandomForestClassifier__warm_start": [True, False],
    }),
}

# Grid search scored with "roc_auc" using cv=10.
# NOTE(review): the traceback in this report shows cv=skf and
# grid.fit(X_train, y_train), so it was produced by a slightly different
# version of this script than the one posted here.
grid = GridSearchCV(estimator= pipe, param_grid= param_grid, n_jobs=-1, 
                          scoring="roc_auc", cv=10)

# The reported ValueError ("Your model (GaussianNB()) does not support
# decision_function") surfaces from this fit call.
# NOTE(review): presumably the roc_auc scorer needs predict_proba or
# decision_function from the 'clf' step -- confirm against PipelineHelper.
grid.fit(X, y)
print(grid.best_score_)