automl / auto-sklearn

Automated Machine Learning with scikit-learn
https://automl.github.io/auto-sklearn
BSD 3-Clause "New" or "Revised" License
7.58k stars 1.28k forks source link

automl.refit() #250

Closed glebmikha closed 6 years ago

glebmikha commented 7 years ago

I get the following error when running refit( X , y ):

`TypeError                                 Traceback (most recent call last)
<ipython-input-7-aa97446b3d2b> in <module>()
----> 1 automl.refit(X.copy().values, y.copy().values)

/root/anaconda2/envs/automl3/lib/python3.5/site-packages/autosklearn/estimators.py in refit(self, X, y)
     46 
     47         """
---> 48         return self._automl.refit(X, y)
     49 
     50     def fit_ensemble(self, y, task=None, metric=None, precision='32',

/root/anaconda2/envs/automl3/lib/python3.5/site-packages/autosklearn/estimators.py in refit(self, X, y)
     46 
     47         """
---> 48         return self._automl.refit(X, y)
     49 
     50     def fit_ensemble(self, y, task=None, metric=None, precision='32',

/root/anaconda2/envs/automl3/lib/python3.5/site-packages/autosklearn/automl.py in refit(self, X, y)
    461                         with warnings.catch_warnings():
    462                             warnings.showwarning = send_warnings_to_log
--> 463                             model.fit(X.copy(), y.copy())
    464                         break
    465                     except ValueError:

/root/anaconda2/envs/automl3/lib/python3.5/site-packages/autosklearn/pipeline/base.py in fit(self, X, y, fit_params)
     86             a classification algorithm first.
     87         """
---> 88         X, fit_params = self.pre_transform(X, y, fit_params=fit_params)
     89         self.fit_estimator(X, y, **fit_params)
     90         return self

/root/anaconda2/envs/automl3/lib/python3.5/site-packages/autosklearn/pipeline/classification.py in pre_transform(self, X, y, fit_params)
    100 
    101         X, fit_params = super(SimpleClassificationPipeline, self).pre_transform(
--> 102             X, y, fit_params=fit_params)
    103 
    104         return X, fit_params

/root/anaconda2/envs/automl3/lib/python3.5/site-packages/autosklearn/pipeline/base.py in pre_transform(self, X, y, fit_params)
     96             fit_params = {key.replace(":", "__"): value for key, value in
     97                           fit_params.items()}
---> 98         X, fit_params = self._pre_transform(X, y, **fit_params)
     99         return X, fit_params
    100 

/root/anaconda2/envs/automl3/lib/python3.5/site-packages/sklearn/pipeline.py in _pre_transform(self, X, y, **fit_params)
    145                 Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
    146             else:
--> 147                 Xt = transform.fit(Xt, y, **fit_params_steps[name]) \
    148                               .transform(Xt)
    149         return Xt, fit_params_steps[self.steps[-1][0]]

/root/anaconda2/envs/automl3/lib/python3.5/site-packages/autosklearn/pipeline/components/base.py in fit(self, X, y, **kwargs)
    377         if kwargs is None:
    378             kwargs = {}
--> 379         return self.choice.fit(X, y, **kwargs)
    380 
    381     def predict(self, X):

/root/anaconda2/envs/automl3/lib/python3.5/site-packages/autosklearn/pipeline/components/feature_preprocessing/random_trees_embedding.py in fit(self, X, Y)
     32             self.max_leaf_nodes = None
     33         else:
---> 34             self.max_leaf_nodes = int(self.max_leaf_nodes)
     35 
     36         self.preprocessor = sklearn.ensemble.RandomTreesEmbedding(

TypeError: int() argument must be a string, a bytes-like object or a number, not 'NoneType'`
mfeurer commented 7 years ago

Could you please provide me with the steps necessary to reproduce this issue?

mfeurer commented 7 years ago

Could you please check with the latest version of auto-sklearn (0.2.0) whether this is still an issue?

christophe-rannou commented 7 years ago

Hi there,

I get almost the same error using auto-sklearn version 0.2.0, except this time it involves gradient_boosting.py:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-12-adfd28801c18> in <module>()
     34 
     35 
---> 36     cls.refit(Xk, yk)
     37 
     38 

/usr/local/lib/python3.4/dist-packages/autosklearn/estimators.py in refit(self, X, y)
     48 
     49         """
---> 50         return self._automl.refit(X, y)
     51 
     52     def fit_ensemble(self, y, task=None, metric=None, precision='32',

/usr/local/lib/python3.4/dist-packages/autosklearn/estimators.py in refit(self, X, y)
     48 
     49         """
---> 50         return self._automl.refit(X, y)
     51 
     52     def fit_ensemble(self, y, task=None, metric=None, precision='32',

/usr/local/lib/python3.4/dist-packages/autosklearn/automl.py in refit(self, X, y)
    495                         with warnings.catch_warnings():
    496                             warnings.showwarning = send_warnings_to_log
--> 497                             model.fit(X.copy(), y.copy())
    498                         break
    499                     except ValueError as e:

/usr/local/lib/python3.4/dist-packages/autosklearn/pipeline/base.py in fit(self, X, y, fit_params)
     87         """
     88         X, fit_params = self.fit_transformer(X, y, fit_params=fit_params)
---> 89         self.fit_estimator(X, y, **fit_params)
     90         return self
     91 

/usr/local/lib/python3.4/dist-packages/autosklearn/pipeline/base.py in fit_estimator(self, X, y, **fit_params)
    102         if fit_params is None:
    103             fit_params = {}
--> 104         self.steps[-1][-1].fit(X, y, **fit_params)
    105         return self
    106 

/usr/local/lib/python3.4/dist-packages/autosklearn/pipeline/components/base.py in fit(self, X, y, **kwargs)
    377         if kwargs is None:
    378             kwargs = {}
--> 379         return self.choice.fit(X, y, **kwargs)
    380 
    381     def predict(self, X):

/usr/local/lib/python3.4/dist-packages/autosklearn/pipeline/components/classification/gradient_boosting.py in fit(self, X, y, sample_weight, refit)
     34     def fit(self, X, y, sample_weight=None, refit=False):
     35         self.iterative_fit(X, y, n_iter=1, sample_weight=sample_weight,
---> 36                            refit=True)
     37         while not self.configuration_fully_fitted():
     38             self.iterative_fit(X, y, n_iter=1, sample_weight=sample_weight)

/usr/local/lib/python3.4/dist-packages/autosklearn/pipeline/components/classification/gradient_boosting.py in iterative_fit(self, X, y, sample_weight, n_iter, refit)
     66                 self.max_leaf_nodes = None
     67             else:
---> 68                 self.max_leaf_nodes = int(self.max_leaf_nodes)
     69             self.verbose = int(self.verbose)
     70 

TypeError: int() argument must be a string or a number, not 'NoneType'

What seems weird is that in the list of models (via cls.show_models()) I cannot find any gradient_boosting:

[(0.300000, SimpleClassificationPipeline({'classifier:adaboost:algorithm': 'SAMME', 'preprocessor:fast_ica:fun': 'logcosh', 'preprocessor:fast_ica:algorithm': 'deflation', 'preprocessor:__choice__': 'fast_ica', 'rescaling:__choice__': 'standardize', 'classifier:__choice__': 'adaboost', 'classifier:adaboost:n_estimators': 491, 'classifier:adaboost:max_depth': 6, 'balancing:strategy': 'weighting', 'one_hot_encoding:use_minimum_fraction': 'False', 'preprocessor:fast_ica:n_components': 1995, 'preprocessor:fast_ica:whiten': 'True', 'classifier:adaboost:learning_rate': 1.4736698696730515, 'imputation:strategy': 'median'},
dataset_properties={
  'target_type': 'classification',
  'multiclass': False,
  'signed': False,
  'sparse': False,
  'task': 1,
  'multilabel': False})),
(0.140000, SimpleClassificationPipeline({'classifier:adaboost:algorithm': 'SAMME.R', 'preprocessor:liblinear_svc_preprocessor:tol': 0.01097852951252465, 'preprocessor:liblinear_svc_preprocessor:loss': 'squared_hinge', 'preprocessor:liblinear_svc_preprocessor:penalty': 'l1', 'preprocessor:__choice__': 'liblinear_svc_preprocessor', 'rescaling:__choice__': 'standardize', 'classifier:__choice__': 'adaboost', 'classifier:adaboost:n_estimators': 232, 'classifier:adaboost:max_depth': 1, 'preprocessor:liblinear_svc_preprocessor:dual': 'False', 'preprocessor:liblinear_svc_preprocessor:multi_class': 'ovr', 'preprocessor:liblinear_svc_preprocessor:intercept_scaling': 1, 'balancing:strategy': 'none', 'one_hot_encoding:use_minimum_fraction': 'False', 'preprocessor:liblinear_svc_preprocessor:C': 18857.462123060137, 'preprocessor:liblinear_svc_preprocessor:fit_intercept': 'True', 'classifier:adaboost:learning_rate': 0.10000000000000002, 'imputation:strategy': 'most_frequent'},
dataset_properties={
  'target_type': 'classification',
  'multiclass': False,
  'signed': False,
  'sparse': False,
  'task': 1,
  'multilabel': False})),
(0.100000, SimpleClassificationPipeline({'classifier:extra_trees:min_weight_fraction_leaf': 0.0, 'preprocessor:feature_agglomeration:pooling_func': 'mean', 'classifier:extra_trees:max_features': 1.0, 'classifier:extra_trees:criterion': 'gini', 'classifier:extra_trees:min_samples_split': 2, 'preprocessor:__choice__': 'feature_agglomeration', 'rescaling:__choice__': 'none', 'classifier:extra_trees:n_estimators': 100, 'classifier:extra_trees:min_samples_leaf': 1, 'classifier:extra_trees:max_depth': 'None', 'classifier:__choice__': 'extra_trees', 'preprocessor:feature_agglomeration:n_clusters': 25, 'balancing:strategy': 'weighting', 'classifier:extra_trees:bootstrap': 'False', 'preprocessor:feature_agglomeration:affinity': 'euclidean', 'one_hot_encoding:use_minimum_fraction': 'False', 'preprocessor:feature_agglomeration:linkage': 'ward', 'imputation:strategy': 'mean'},
dataset_properties={
  'target_type': 'classification',
  'multiclass': False,
  'signed': False,
  'sparse': False,
  'task': 1,
  'multilabel': False})),
(0.080000, SimpleClassificationPipeline({'classifier:libsvm_svc:kernel': 'rbf', 'classifier:libsvm_svc:max_iter': -1, 'classifier:libsvm_svc:tol': 0.07228314195704957, 'classifier:libsvm_svc:C': 133.619004912714, 'preprocessor:__choice__': 'no_preprocessing', 'rescaling:__choice__': 'minmax', 'classifier:__choice__': 'libsvm_svc', 'classifier:libsvm_svc:shrinking': 'False', 'balancing:strategy': 'none', 'classifier:libsvm_svc:gamma': 1.421889512788389, 'one_hot_encoding:use_minimum_fraction': 'False', 'imputation:strategy': 'median'},
dataset_properties={
  'target_type': 'classification',
  'multiclass': False,
  'signed': False,
  'sparse': False,
  'task': 1,
  'multilabel': False})),
(0.080000, SimpleClassificationPipeline({'classifier:adaboost:algorithm': 'SAMME', 'classifier:adaboost:max_depth': 6, 'balancing:strategy': 'none', 'classifier:adaboost:learning_rate': 1.4736698696730515, 'classifier:adaboost:n_estimators': 491, 'preprocessor:__choice__': 'no_preprocessing', 'rescaling:__choice__': 'none', 'one_hot_encoding:use_minimum_fraction': 'False', 'classifier:__choice__': 'adaboost', 'imputation:strategy': 'median'},
dataset_properties={
  'target_type': 'classification',
  'multiclass': False,
  'signed': False,
  'sparse': False,
  'task': 1,
  'multilabel': False})),
(0.060000, SimpleClassificationPipeline({'classifier:decision_tree:criterion': 'entropy', 'classifier:decision_tree:splitter': 'best', 'preprocessor:feature_agglomeration:affinity': 'euclidean', 'preprocessor:feature_agglomeration:pooling_func': 'median', 'classifier:decision_tree:max_depth': 1.2028353286235083, 'preprocessor:__choice__': 'feature_agglomeration', 'rescaling:__choice__': 'none', 'classifier:__choice__': 'decision_tree', 'classifier:decision_tree:min_samples_split': 3, 'preprocessor:feature_agglomeration:n_clusters': 303, 'balancing:strategy': 'none', 'preprocessor:feature_agglomeration:linkage': 'complete', 'classifier:decision_tree:min_weight_fraction_leaf': 0.0, 'classifier:decision_tree:max_leaf_nodes': 'None', 'classifier:decision_tree:max_features': 1.0, 'one_hot_encoding:minimum_fraction': 0.29966862109739095, 'one_hot_encoding:use_minimum_fraction': 'True', 'classifier:decision_tree:min_samples_leaf': 14, 'imputation:strategy': 'median'},
dataset_properties={
  'target_type': 'classification',
  'multiclass': False,
  'signed': False,
  'sparse': False,
  'task': 1,
  'multilabel': False})),
(0.040000, SimpleClassificationPipeline({'preprocessor:feature_agglomeration:pooling_func': 'mean', 'preprocessor:feature_agglomeration:affinity': 'manhattan', 'classifier:random_forest:max_depth': 'None', 'classifier:random_forest:criterion': 'entropy', 'rescaling:__choice__': 'none', 'classifier:random_forest:n_estimators': 100, 'classifier:__choice__': 'random_forest', 'one_hot_encoding:use_minimum_fraction': 'False', 'classifier:random_forest:max_features': 3.517874457919551, 'preprocessor:feature_agglomeration:n_clusters': 338, 'balancing:strategy': 'none', 'classifier:random_forest:bootstrap': 'False', 'classifier:random_forest:min_samples_leaf': 1, 'classifier:random_forest:min_samples_split': 8, 'classifier:random_forest:min_weight_fraction_leaf': 0.0, 'classifier:random_forest:max_leaf_nodes': 'None', 'preprocessor:__choice__': 'feature_agglomeration', 'preprocessor:feature_agglomeration:linkage': 'complete', 'imputation:strategy': 'most_frequent'},
dataset_properties={
  'target_type': 'classification',
  'multiclass': False,
  'signed': False,
  'sparse': False,
  'task': 1,
  'multilabel': False})),
(0.040000, SimpleClassificationPipeline({'classifier:adaboost:algorithm': 'SAMME.R', 'classifier:adaboost:max_depth': 8, 'one_hot_encoding:minimum_fraction': 0.0010608268861776148, 'balancing:strategy': 'none', 'classifier:adaboost:learning_rate': 0.1389245240873958, 'classifier:adaboost:n_estimators': 361, 'preprocessor:__choice__': 'no_preprocessing', 'rescaling:__choice__': 'none', 'one_hot_encoding:use_minimum_fraction': 'True', 'classifier:__choice__': 'adaboost', 'imputation:strategy': 'median'},
dataset_properties={
  'target_type': 'classification',
  'multiclass': False,
  'signed': False,
  'sparse': False,
  'task': 1,
  'multilabel': False})),
(0.040000, SimpleClassificationPipeline({'classifier:adaboost:algorithm': 'SAMME', 'preprocessor:fast_ica:fun': 'exp', 'preprocessor:fast_ica:algorithm': 'deflation', 'preprocessor:__choice__': 'fast_ica', 'rescaling:__choice__': 'minmax', 'classifier:__choice__': 'adaboost', 'classifier:adaboost:n_estimators': 432, 'classifier:adaboost:max_depth': 10, 'balancing:strategy': 'none', 'one_hot_encoding:use_minimum_fraction': 'False', 'preprocessor:fast_ica:n_components': 226, 'preprocessor:fast_ica:whiten': 'True', 'classifier:adaboost:learning_rate': 0.1335260976284136, 'imputation:strategy': 'median'},
dataset_properties={
  'target_type': 'classification',
  'multiclass': False,
  'signed': False,
  'sparse': False,
  'task': 1,
  'multilabel': False})),
(0.020000, SimpleClassificationPipeline({'one_hot_encoding:minimum_fraction': 0.01, 'classifier:random_forest:max_depth': 'None', 'classifier:random_forest:criterion': 'gini', 'rescaling:__choice__': 'standardize', 'classifier:random_forest:n_estimators': 100, 'classifier:__choice__': 'random_forest', 'one_hot_encoding:use_minimum_fraction': 'True', 'classifier:random_forest:max_features': 1.0, 'balancing:strategy': 'none', 'classifier:random_forest:bootstrap': 'True', 'classifier:random_forest:min_samples_leaf': 1, 'classifier:random_forest:min_samples_split': 2, 'classifier:random_forest:min_weight_fraction_leaf': 0.0, 'classifier:random_forest:max_leaf_nodes': 'None', 'preprocessor:__choice__': 'no_preprocessing', 'imputation:strategy': 'mean'},
dataset_properties={
  'target_type': 'classification',
  'multiclass': False,
  'signed': False,
  'sparse': False,
  'task': 1,
  'multilabel': False})),
(0.020000, SimpleClassificationPipeline({'one_hot_encoding:minimum_fraction': 0.010000000000000004, 'preprocessor:feature_agglomeration:affinity': 'euclidean', 'classifier:random_forest:max_depth': 'None', 'preprocessor:feature_agglomeration:pooling_func': 'mean', 'classifier:random_forest:criterion': 'gini', 'rescaling:__choice__': 'normalize', 'classifier:random_forest:n_estimators': 100, 'classifier:__choice__': 'random_forest', 'one_hot_encoding:use_minimum_fraction': 'True', 'classifier:random_forest:max_features': 1.0, 'preprocessor:feature_agglomeration:n_clusters': 374, 'balancing:strategy': 'none', 'classifier:random_forest:bootstrap': 'True', 'classifier:random_forest:min_samples_leaf': 1, 'classifier:random_forest:min_samples_split': 2, 'classifier:random_forest:min_weight_fraction_leaf': 0.0, 'classifier:random_forest:max_leaf_nodes': 'None', 'preprocessor:__choice__': 'feature_agglomeration', 'preprocessor:feature_agglomeration:linkage': 'ward', 'imputation:strategy': 'median'},
dataset_properties={
  'target_type': 'classification',
  'multiclass': False,
  'signed': False,
  'sparse': False,
  'task': 1,
  'multilabel': False})),
(0.020000, SimpleClassificationPipeline({'preprocessor:fast_ica:fun': 'logcosh', 'preprocessor:fast_ica:algorithm': 'deflation', 'classifier:k_nearest_neighbors:n_neighbors': 6, 'preprocessor:__choice__': 'fast_ica', 'rescaling:__choice__': 'minmax', 'classifier:__choice__': 'k_nearest_neighbors', 'classifier:k_nearest_neighbors:weights': 'distance', 'balancing:strategy': 'none', 'classifier:k_nearest_neighbors:p': 2, 'preprocessor:fast_ica:whiten': 'True', 'preprocessor:fast_ica:n_components': 1951, 'one_hot_encoding:use_minimum_fraction': 'False', 'imputation:strategy': 'mean'},
dataset_properties={
  'target_type': 'classification',
  'multiclass': False,
  'signed': False,
  'sparse': False,
  'task': 1,
  'multilabel': False})),
(0.020000, SimpleClassificationPipeline({'classifier:extra_trees:min_weight_fraction_leaf': 0.0, 'classifier:extra_trees:max_features': 1.0, 'classifier:extra_trees:criterion': 'gini', 'preprocessor:select_rates:score_func': 'chi2', 'classifier:extra_trees:min_samples_split': 2, 'preprocessor:__choice__': 'select_rates', 'rescaling:__choice__': 'none', 'classifier:extra_trees:n_estimators': 100, 'classifier:extra_trees:min_samples_leaf': 1, 'classifier:extra_trees:max_depth': 'None', 'classifier:__choice__': 'extra_trees', 'balancing:strategy': 'none', 'classifier:extra_trees:bootstrap': 'False', 'preprocessor:select_rates:mode': 'fpr', 'preprocessor:select_rates:alpha': 0.1, 'one_hot_encoding:use_minimum_fraction': 'False', 'imputation:strategy': 'mean'},
dataset_properties={
  'target_type': 'classification',
  'multiclass': False,
  'signed': False,
  'sparse': False,
  'task': 1,
  'multilabel': False})),
(0.020000, SimpleClassificationPipeline({'one_hot_encoding:minimum_fraction': 0.34110222241136745, 'classifier:random_forest:max_depth': 'None', 'classifier:random_forest:criterion': 'gini', 'rescaling:__choice__': 'none', 'classifier:random_forest:n_estimators': 100, 'classifier:__choice__': 'random_forest', 'one_hot_encoding:use_minimum_fraction': 'True', 'classifier:random_forest:max_features': 0.9213917629406125, 'balancing:strategy': 'none', 'classifier:random_forest:bootstrap': 'False', 'classifier:random_forest:min_samples_leaf': 5, 'classifier:random_forest:min_samples_split': 7, 'classifier:random_forest:min_weight_fraction_leaf': 0.0, 'classifier:random_forest:max_leaf_nodes': 'None', 'preprocessor:__choice__': 'no_preprocessing', 'imputation:strategy': 'mean'},
dataset_properties={
  'target_type': 'classification',
  'multiclass': False,
  'signed': False,
  'sparse': False,
  'task': 1,
  'multilabel': False})),
(0.020000, SimpleClassificationPipeline({'preprocessor:polynomial:degree': 2, 'classifier:random_forest:max_depth': 'None', 'preprocessor:polynomial:include_bias': 'False', 'classifier:random_forest:criterion': 'entropy', 'rescaling:__choice__': 'none', 'classifier:random_forest:n_estimators': 100, 'classifier:__choice__': 'random_forest', 'one_hot_encoding:use_minimum_fraction': 'False', 'classifier:random_forest:max_features': 1.0, 'balancing:strategy': 'none', 'classifier:random_forest:bootstrap': 'True', 'preprocessor:polynomial:interaction_only': 'False', 'classifier:random_forest:min_samples_leaf': 1, 'classifier:random_forest:min_samples_split': 2, 'classifier:random_forest:min_weight_fraction_leaf': 0.0, 'classifier:random_forest:max_leaf_nodes': 'None', 'preprocessor:__choice__': 'polynomial', 'imputation:strategy': 'mean'},
dataset_properties={
  'target_type': 'classification',
  'multiclass': False,
  'signed': False,
  'sparse': False,
  'task': 1,
  'multilabel': False})),
]

Thanks

mfeurer commented 7 years ago

That is indeed a weird problem, and from the top of my head I don't know where it could come from. @christophe-rannou could you try to provide me with a minimal working example which allows me to reproduce the error? Also, could you please post the output of either pip list or conda list? This is potentially related to #326.

christophe-rannou commented 7 years ago

Sadly I cannot provide you with the dataset, and the examples run without problems. This is the conda list:

argparse                  1.4.0                     <pip>
asn1crypto                0.22.0                   py36_0  
auto-sklearn              0.2.0                     <pip>
cffi                      1.10.0                   py36_0  
cloog                     0.18.0                        0  
conda                     4.3.22                   py36_0  
conda-env                 2.6.0                         0  
ConfigSpace               0.3.9                     <pip>
cryptography              1.8.1                    py36_0  
Cython                    0.25.2                    <pip>
decorator                 4.1.1                     <pip>
docutils                  0.13.1                    <pip>
future                    0.16.0                    <pip>
gcc                       4.8.5                         7  
gmp                       6.1.0                         0  
idna                      2.5                      py36_0  
isl                       0.12.2                        0  
joblib                    0.11                      <pip>
liac-arff                 2.1.1                     <pip>
libffi                    3.2.1                         1  
lockfile                  0.12.2                    <pip>
mpc                       1.0.3                         0  
mpfr                      3.1.5                         0  
networkx                  1.11                      <pip>
nose                      1.3.7                     <pip>
numpy                     1.13.1                    <pip>
openssl                   1.0.2l                        0  
packaging                 16.8                     py36_0  
pandas                    0.20.3                    <pip>
pcre                      8.39                          1  
pip                       9.0.1                    py36_1  
protobuf                  3.3.0                     <pip>
psutil                    5.2.2                     <pip>
pycosat                   0.6.2                    py36_0  
pycparser                 2.17                     py36_0  
pynisher                  0.4.2                     <pip>
pyopenssl                 17.0.0                   py36_0  
pyparsing                 2.1.4                    py36_0  
pyPhoenix                 0.10.0                    <pip>
pyrfr                     0.4.0                     <pip>
python                    3.6.1                         2  
python-dateutil           2.6.1                     <pip>
pytz                      2017.2                    <pip>
PyYAML                    3.12                      <pip>
readline                  6.2                           2  
requests                  2.14.2                   py36_0  
ruamel_yaml               0.11.14                  py36_1  
scikit-learn              0.18.1                    <pip>
scipy                     0.19.1                    <pip>
setuptools                27.2.0                   py36_0  
six                       1.10.0                   py36_0  
sklearn                   0.0                       <pip>
smac                      0.5.0                     <pip>
SQLAlchemy                1.1.11                    <pip>
sqlite                    3.13.0                        0  
swig                      3.0.10                        0  
tk                        8.5.18                        0  
typing                    3.6.1                     <pip>
wheel                     0.29.0                   py36_0  
xz                        5.2.2                         1  
yaml                      0.1.6                         0  
zlib                      1.2.8                         3  

I tried making a prediction without making a refit and I get the following error :

---------------------------------------------------------------------------
NotImplementedError                       Traceback (most recent call last)
<ipython-input-22-59975d76b47f> in <module>()
      2 # pos_proba = cls.predict_proba(X_crossval[val_set])[:,list(rfc.classes_).index(1)]
      3 # roc_auc_score(y_crossval[val_set], pos_proba)
----> 4 predictions = cls.predict(X_test)
      5 accuracy_score(y_test, predictions)

/usr/local/lib/python3.4/dist-packages/autosklearn/estimators.py in predict(self, X, batch_size, n_jobs)
    419         """
    420         return super(AutoSklearnClassifier, self).predict(
--> 421             X, batch_size=batch_size, n_jobs=n_jobs)
    422 
    423     def predict_proba(self, X, batch_size=None, n_jobs=1):

/usr/local/lib/python3.4/dist-packages/autosklearn/estimators.py in predict(self, X, batch_size, n_jobs)
     59 
     60     def predict(self, X, batch_size=None, n_jobs=1):
---> 61         return self._automl.predict(X, batch_size=batch_size, n_jobs=n_jobs)
     62 
     63     def score(self, X, y):

/usr/local/lib/python3.4/dist-packages/autosklearn/estimators.py in predict(self, X, batch_size, n_jobs)
    587     def predict(self, X, batch_size=None, n_jobs=1):
    588         predicted_probabilities = self._automl.predict(
--> 589             X, batch_size=batch_size, n_jobs=n_jobs)
    590 
    591         if self._n_outputs == 1:

/usr/local/lib/python3.4/dist-packages/autosklearn/automl.py in predict(self, X, batch_size, n_jobs)
    543         all_predictions = joblib.Parallel(n_jobs=n_jobs)(
    544             joblib.delayed(_model_predict)(self, X, batch_size, identifier)
--> 545             for identifier in self.ensemble_.get_model_identifiers())
    546 
    547         if len(all_predictions) == 0:

/usr/local/lib/python3.4/dist-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
    756             # was dispatched. In particular this covers the edge
    757             # case of Parallel used with an exhausted iterator.
--> 758             while self.dispatch_one_batch(iterator):
    759                 self._iterating = True
    760             else:

/usr/local/lib/python3.4/dist-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
    606                 return False
    607             else:
--> 608                 self._dispatch(tasks)
    609                 return True
    610 

/usr/local/lib/python3.4/dist-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
    569         dispatch_timestamp = time.time()
    570         cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 571         job = self._backend.apply_async(batch, callback=cb)
    572         self._jobs.append(job)
    573 

/usr/local/lib/python3.4/dist-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
    107     def apply_async(self, func, callback=None):
    108         """Schedule a func to be run"""
--> 109         result = ImmediateResult(func)
    110         if callback:
    111             callback(result)

/usr/local/lib/python3.4/dist-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
    324         # Don't delay the application, to avoid keeping the input
    325         # arguments in memory
--> 326         self.results = batch()
    327 
    328     def get(self):

/usr/local/lib/python3.4/dist-packages/sklearn/externals/joblib/parallel.py in __call__(self)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
    132 
    133     def __len__(self):

/usr/local/lib/python3.4/dist-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
    132 
    133     def __len__(self):

/usr/local/lib/python3.4/dist-packages/autosklearn/automl.py in _model_predict(self, X, batch_size, identifier)
     43             prediction = model.predict(X_, batch_size=batch_size)
     44         else:
---> 45             prediction = model.predict_proba(X_, batch_size=batch_size)
     46     if len(prediction.shape) < 1 or len(X_.shape) < 1 or \
     47             X_.shape[0] < 1 or prediction.shape[0] != X_.shape[0]:

/usr/local/lib/python3.4/dist-packages/autosklearn/pipeline/classification.py in predict_proba(self, X, batch_size)
    125                 Xt = transform.transform(Xt)
    126 
--> 127             return self.steps[-1][-1].predict_proba(Xt)
    128 
    129         else:

/usr/local/lib/python3.4/dist-packages/autosklearn/pipeline/components/classification/__init__.py in predict_proba(self, X)
    124 
    125     def predict_proba(self, X):
--> 126         return self.choice.predict_proba(X)
    127 
    128     def estimator_supports_iterative_fit(self):

/usr/local/lib/python3.4/dist-packages/autosklearn/pipeline/components/classification/gradient_boosting.py in predict_proba(self, X)
    112     def predict_proba(self, X):
    113         if self.estimator is None:
--> 114             raise NotImplementedError()
    115         return self.estimator.predict_proba(X)
    116 

NotImplementedError: 

Once again it is the gradient_boosting that raises an error. It definitely seems related to #326.

mfeurer commented 7 years ago

@christophe-rannou could you please check if the cause of #326 is the cause of your issue here?

christophe-rannou commented 7 years ago

Well I just tried again and executed the following code

import autosklearn.classification
import sklearn.model_selection
X_train, X_test, y_train, y_test = \
        sklearn.model_selection.train_test_split(X, y, random_state=1)
cls = autosklearn.classification.AutoSklearnClassifier()
cls.fit(X_train, y_train)
cls.refit(X, y)

I ended up with a similar error except this time it was not GradientBoosting

/root/miniconda3/envs/py24/lib/python3.4/site-packages/autosklearn/pipeline/components/feature_preprocessing/random_trees_embedding.py in fit(self, X, Y)
     32             self.max_leaf_nodes = None
     33         else:
---> 34             self.max_leaf_nodes = int(self.max_leaf_nodes)
     35 
     36         self.preprocessor = sklearn.ensemble.RandomTreesEmbedding(

TypeError: int() argument must be a string, a bytes-like object or a number, not 'NoneType'
mfeurer commented 7 years ago

Thanks for bringing this up again. Could you do me a favor and execute the following piece of code on your machine to see if it triggers the error (as it doesn't on mine, and it should be triggering the error according to your description of the problem)?

import sklearn.model_selection
import sklearn.datasets
import sklearn.metrics

import autosklearn.classification

def main():
    X, y = sklearn.datasets.load_iris(return_X_y=True)
    X_train, X_test, y_train, y_test = \
        sklearn.model_selection.train_test_split(X, y, random_state=1)

    automl = autosklearn.classification.AutoSklearnClassifier(
        time_left_for_this_task=30, per_run_time_limit=10,
        tmp_folder='/tmp/autoslearn_holdout_example_tmp',
        output_folder='/tmp/autosklearn_holdout_example_out',
        disable_evaluator_output=False,
        include_preprocessors=["random_trees_embedding"],
        include_estimators=["sgd"],
        delete_tmp_folder_after_terminate=False,
        resampling_strategy='cv')
    automl.fit(X_train, y_train, dataset_name='digits')
    automl.refit(X, y)

    # Print the final ensemble constructed by auto-sklearn.
    print(automl.show_models())
    predictions = automl.predict(X_test)
    # Print statistics about the auto-sklearn run such as number of
    # iterations, number of models failed with a time out.
    print(automl.sprint_statistics())
    print("Accuracy score", sklearn.metrics.accuracy_score(y_test, predictions))
    print(automl.cv_results_)

if __name__ == '__main__':
    main()

A quick fix would be to change line 31 in your file to check whether that parameter is None. However, this still doesn't explain the reason for this failure.

mfeurer commented 6 years ago

This is a duplicate of #326. I posted the actual error there and will fix the issue with the next, upcoming release.