EpistasisLab / tpot

A Python Automated Machine Learning tool that optimizes machine learning pipelines using genetic programming.
http://epistasislab.github.io/tpot/
GNU Lesser General Public License v3.0

TPOT NN example fails #1339

Open norandom opened 5 months ago

norandom commented 5 months ago

Hi,

I just ran the demo code for the neural network classifier (TPOT-NN); I am not very familiar with this problem.

https://epistasislab.github.io/tpot/examples/#neural-network-classifier-using-tpot-nn

from tpot import TPOTClassifier
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split

X, y = make_blobs(n_samples=100, centers=2, n_features=3, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, test_size=0.25)

clf = TPOTClassifier(config_dict='TPOT NN', template='Selector-Transformer-PytorchLRClassifier',
                     verbosity=2, population_size=10, generations=10)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))
clf.export('tpot_nn_demo_pipeline.py')

Error trace:


Generation 1 - Current best internal CV score: -inf

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
File ~/anaconda3/lib/python3.11/site-packages/tpot/base.py:817, in TPOTBase.fit(self, features, target, sample_weight, groups)
    816         warnings.simplefilter("ignore")
--> 817         self._pop, _ = eaMuPlusLambda(
    818             population=self._pop,
    819             toolbox=self._toolbox,
    820             mu=self.population_size,
    821             lambda_=self._lambda,
    822             cxpb=self.crossover_rate,
    823             mutpb=self.mutation_rate,
    824             ngen=self.generations,
    825             pbar=self._pbar,
    826             halloffame=self._pareto_front,
    827             verbose=self.verbosity,
    828             per_generation_function=self._check_periodic_pipeline,
    829             log_file=self.log_file_,
    830         )
    832 # Allow for certain exceptions to signal a premature fit() cancellation

File ~/anaconda3/lib/python3.11/site-packages/tpot/gp_deap.py:285, in eaMuPlusLambda(population, toolbox, mu, lambda_, cxpb, mutpb, ngen, pbar, stats, halloffame, verbose, per_generation_function, log_file)
    284 if per_generation_function is not None:
--> 285     per_generation_function(gen)
    287 # Update the statistics with the new population

File ~/anaconda3/lib/python3.11/site-packages/tpot/base.py:1183, in TPOTBase._check_periodic_pipeline(self, gen)
   1173 """If enough time has passed, save a new optimized pipeline. Currently used in the per generation hook in the optimization loop.
   1174 Parameters
   1175 ----------
   (...)
   1181 None
   1182 """
-> 1183 self._update_top_pipeline()
   1184 if self.periodic_checkpoint_folder is not None:

File ~/anaconda3/lib/python3.11/site-packages/tpot/base.py:925, in TPOTBase._update_top_pipeline(self)
    923 from sklearn.model_selection import cross_val_score
--> 925 cv_scores = cross_val_score(
    926     sklearn_pipeline,
    927     self.pretest_X,
    928     self.pretest_y,
    929     cv=self.cv,
    930     scoring=self.scoring_function,
    931     verbose=0,
    932     error_score="raise",
    933 )
    934 break

File ~/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:562, in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, error_score)
    560 scorer = check_scoring(estimator, scoring=scoring)
--> 562 cv_results = cross_validate(
    563     estimator=estimator,
    564     X=X,
    565     y=y,
    566     groups=groups,
    567     scoring={"score": scorer},
    568     cv=cv,
    569     n_jobs=n_jobs,
    570     verbose=verbose,
    571     fit_params=fit_params,
    572     pre_dispatch=pre_dispatch,
    573     error_score=error_score,
    574 )
    575 return cv_results["test_score"]

File ~/anaconda3/lib/python3.11/site-packages/sklearn/utils/_param_validation.py:211, in validate_params.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
    206     with config_context(
    207         skip_parameter_validation=(
    208             prefer_skip_nested_validation or global_skip_validation
    209         )
    210     ):
--> 211         return func(*args, **kwargs)
    212 except InvalidParameterError as e:
    213     # When the function is just a wrapper around an estimator, we allow
    214     # the function to delegate validation to the estimator, but we replace
    215     # the name of the estimator by the name of the function in the error
    216     # message to avoid confusion.

File ~/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:309, in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score, return_estimator, return_indices, error_score)
    308 parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)
--> 309 results = parallel(
    310     delayed(_fit_and_score)(
    311         clone(estimator),
    312         X,
    313         y,
    314         scorers,
    315         train,
    316         test,
    317         verbose,
    318         None,
    319         fit_params,
    320         return_train_score=return_train_score,
    321         return_times=True,
    322         return_estimator=return_estimator,
    323         error_score=error_score,
    324     )
    325     for train, test in indices
    326 )
    328 _warn_or_raise_about_fit_failures(results, error_score)

File ~/anaconda3/lib/python3.11/site-packages/sklearn/utils/parallel.py:65, in Parallel.__call__(self, iterable)
     61 iterable_with_config = (
     62     (_with_config(delayed_func, config), args, kwargs)
     63     for delayed_func, args, kwargs in iterable
     64 )
---> 65 return super().__call__(iterable_with_config)

File ~/anaconda3/lib/python3.11/site-packages/joblib/parallel.py:1085, in Parallel.__call__(self, iterable)
   1084 self._iterating = False
-> 1085 if self.dispatch_one_batch(iterator):
   1086     self._iterating = self._original_iterator is not None

File ~/anaconda3/lib/python3.11/site-packages/joblib/parallel.py:901, in Parallel.dispatch_one_batch(self, iterator)
    900 else:
--> 901     self._dispatch(tasks)
    902     return True

File ~/anaconda3/lib/python3.11/site-packages/joblib/parallel.py:819, in Parallel._dispatch(self, batch)
    818 job_idx = len(self._jobs)
--> 819 job = self._backend.apply_async(batch, callback=cb)
    820 # A job can complete so quickly than its callback is
    821 # called before we get here, causing self._jobs to
    822 # grow. To ensure correct results ordering, .insert is
    823 # used (rather than .append) in the following line

File ~/anaconda3/lib/python3.11/site-packages/joblib/_parallel_backends.py:208, in SequentialBackend.apply_async(self, func, callback)
    207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
    209 if callback:

File ~/anaconda3/lib/python3.11/site-packages/joblib/_parallel_backends.py:597, in ImmediateResult.__init__(self, batch)
    594 def __init__(self, batch):
    595     # Don't delay the application, to avoid keeping the input
    596     # arguments in memory
--> 597     self.results = batch()

File ~/anaconda3/lib/python3.11/site-packages/joblib/parallel.py:288, in BatchedCalls.__call__(self)
    287 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 288     return [func(*args, **kwargs)
    289             for func, args, kwargs in self.items]

File ~/anaconda3/lib/python3.11/site-packages/joblib/parallel.py:288, in <listcomp>(.0)
    287 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 288     return [func(*args, **kwargs)
    289             for func, args, kwargs in self.items]

File ~/anaconda3/lib/python3.11/site-packages/sklearn/utils/parallel.py:127, in _FuncWrapper.__call__(self, *args, **kwargs)
    126 with config_context(**config):
--> 127     return self.function(*args, **kwargs)

File ~/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:754, in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, split_progress, candidate_progress, error_score)
    753 fit_time = time.time() - start_time
--> 754 test_scores = _score(estimator, X_test, y_test, scorer, error_score)
    755 score_time = time.time() - start_time - fit_time

File ~/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:813, in _score(estimator, X_test, y_test, scorer, error_score)
    812     else:
--> 813         scores = scorer(estimator, X_test, y_test)
    814 except Exception:

File ~/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py:144, in _MultimetricScorer.__call__(self, estimator, *args, **kwargs)
    143 if self._raise_exc:
--> 144     raise e
    145 else:

File ~/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py:136, in _MultimetricScorer.__call__(self, estimator, *args, **kwargs)
    135 if isinstance(scorer, _BaseScorer):
--> 136     score = scorer._score(
    137         cached_call, estimator, *args, **routed_params.get(name).score
    138     )
    139 else:

File ~/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py:353, in _PredictScorer._score(self, method_caller, estimator, X, y_true, **kwargs)
    345 self._warn_overlap(
    346     message=(
    347         "There is an overlap between set kwargs of this scorer instance and"
   (...)
    351     kwargs=kwargs,
    352 )
--> 353 y_pred = method_caller(estimator, "predict", X)
    354 scoring_kwargs = {**self._kwargs, **kwargs}

File ~/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py:86, in _cached_call(cache, estimator, response_method, *args, **kwargs)
     84     return cache[response_method]
---> 86 result, _ = _get_response_values(
     87     estimator, *args, response_method=response_method, **kwargs
     88 )
     90 if cache is not None:

File ~/anaconda3/lib/python3.11/site-packages/sklearn/utils/_response.py:74, in _get_response_values(estimator, X, response_method, pos_label)
     73 prediction_method = _check_response_method(estimator, response_method)
---> 74 classes = estimator.classes_
     75 target_type = "binary" if len(classes) <= 2 else "multiclass"

File ~/anaconda3/lib/python3.11/site-packages/sklearn/pipeline.py:758, in Pipeline.classes_(self)
    757 """The classes labels. Only exist if the last step is a classifier."""
--> 758 return self.steps[-1][1].classes_

AttributeError: 'PytorchLRClassifier' object has no attribute 'classes_'

During handling of the above exception, another exception occurred:

AttributeError                            Traceback (most recent call last)
Cell In[2], line 10
      6 X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, test_size=0.25)
      8 clf = TPOTClassifier(config_dict='TPOT NN', template='Selector-Transformer-PytorchLRClassifier',
      9                      verbosity=2, population_size=10, generations=10)
---> 10 clf.fit(X_train, y_train)
     11 print(clf.score(X_test, y_test))
     12 clf.export('tpot_nn_demo_pipeline.py')

File ~/anaconda3/lib/python3.11/site-packages/tpot/base.py:864, in TPOTBase.fit(self, features, target, sample_weight, groups)
    861     except (KeyboardInterrupt, SystemExit, Exception) as e:
    862         # raise the exception if it's our last attempt
    863         if attempt == (attempts - 1):
--> 864             raise e
    865 return self

File ~/anaconda3/lib/python3.11/site-packages/tpot/base.py:855, in TPOTBase.fit(self, features, target, sample_weight, groups)
    852 if not isinstance(self._pbar, type(None)):
    853     self._pbar.close()
--> 855 self._update_top_pipeline()
    856 self._summary_of_best_pipeline(features, target)
    857 # Delete the temporary cache before exiting

File ~/anaconda3/lib/python3.11/site-packages/tpot/base.py:925, in TPOTBase._update_top_pipeline(self)
    922         sklearn_pipeline = self._toolbox.compile(expr=pipeline)
    923         from sklearn.model_selection import cross_val_score
--> 925         cv_scores = cross_val_score(
    926             sklearn_pipeline,
    927             self.pretest_X,
    928             self.pretest_y,
    929             cv=self.cv,
    930             scoring=self.scoring_function,
    931             verbose=0,
    932             error_score="raise",
    933         )
    934         break
    935 raise RuntimeError(
    936     "There was an error in the TPOT optimization "
    937     "process. This could be because the data was "
   (...)
    945     "https://epistasislab.github.io/tpot/using/"
    946 )

File ~/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:562, in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, error_score)
    559 # To ensure multimetric format is not supported
    560 scorer = check_scoring(estimator, scoring=scoring)
--> 562 cv_results = cross_validate(
    563     estimator=estimator,
    564     X=X,
    565     y=y,
    566     groups=groups,
    567     scoring={"score": scorer},
    568     cv=cv,
    569     n_jobs=n_jobs,
    570     verbose=verbose,
    571     fit_params=fit_params,
    572     pre_dispatch=pre_dispatch,
    573     error_score=error_score,
    574 )
    575 return cv_results["test_score"]

File ~/anaconda3/lib/python3.11/site-packages/sklearn/utils/_param_validation.py:211, in validate_params.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
    205 try:
    206     with config_context(
    207         skip_parameter_validation=(
    208             prefer_skip_nested_validation or global_skip_validation
    209         )
    210     ):
--> 211         return func(*args, **kwargs)
    212 except InvalidParameterError as e:
    213     # When the function is just a wrapper around an estimator, we allow
    214     # the function to delegate validation to the estimator, but we replace
    215     # the name of the estimator by the name of the function in the error
    216     # message to avoid confusion.
    217     msg = re.sub(
    218         r"parameter of \w+ must be",
    219         f"parameter of {func.__qualname__} must be",
    220         str(e),
    221     )

File ~/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:309, in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score, return_estimator, return_indices, error_score)
    306 # We clone the estimator to make sure that all the folds are
    307 # independent, and that it is pickle-able.
    308 parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)
--> 309 results = parallel(
    310     delayed(_fit_and_score)(
    311         clone(estimator),
    312         X,
    313         y,
    314         scorers,
    315         train,
    316         test,
    317         verbose,
    318         None,
    319         fit_params,
    320         return_train_score=return_train_score,
    321         return_times=True,
    322         return_estimator=return_estimator,
    323         error_score=error_score,
    324     )
    325     for train, test in indices
    326 )
    328 _warn_or_raise_about_fit_failures(results, error_score)
    330 # For callable scoring, the return type is only know after calling. If the
    331 # return type is a dictionary, the error scores can now be inserted with
    332 # the correct key.

File ~/anaconda3/lib/python3.11/site-packages/sklearn/utils/parallel.py:65, in Parallel.__call__(self, iterable)
     60 config = get_config()
     61 iterable_with_config = (
     62     (_with_config(delayed_func, config), args, kwargs)
     63     for delayed_func, args, kwargs in iterable
     64 )
---> 65 return super().__call__(iterable_with_config)

File ~/anaconda3/lib/python3.11/site-packages/joblib/parallel.py:1085, in Parallel.__call__(self, iterable)
   1076 try:
   1077     # Only set self._iterating to True if at least a batch
   1078     # was dispatched. In particular this covers the edge
   (...)
   1082     # was very quick and its callback already dispatched all the
   1083     # remaining jobs.
   1084     self._iterating = False
-> 1085     if self.dispatch_one_batch(iterator):
   1086         self._iterating = self._original_iterator is not None
   1088     while self.dispatch_one_batch(iterator):

File ~/anaconda3/lib/python3.11/site-packages/joblib/parallel.py:901, in Parallel.dispatch_one_batch(self, iterator)
    899     return False
    900 else:
--> 901     self._dispatch(tasks)
    902     return True

File ~/anaconda3/lib/python3.11/site-packages/joblib/parallel.py:819, in Parallel._dispatch(self, batch)
    817 with self._lock:
    818     job_idx = len(self._jobs)
--> 819     job = self._backend.apply_async(batch, callback=cb)
    820     # A job can complete so quickly than its callback is
    821     # called before we get here, causing self._jobs to
    822     # grow. To ensure correct results ordering, .insert is
    823     # used (rather than .append) in the following line
    824     self._jobs.insert(job_idx, job)

File ~/anaconda3/lib/python3.11/site-packages/joblib/_parallel_backends.py:208, in SequentialBackend.apply_async(self, func, callback)
    206 def apply_async(self, func, callback=None):
    207     """Schedule a func to be run"""
--> 208     result = ImmediateResult(func)
    209     if callback:
    210         callback(result)

File ~/anaconda3/lib/python3.11/site-packages/joblib/_parallel_backends.py:597, in ImmediateResult.__init__(self, batch)
    594 def __init__(self, batch):
    595     # Don't delay the application, to avoid keeping the input
    596     # arguments in memory
--> 597     self.results = batch()

File ~/anaconda3/lib/python3.11/site-packages/joblib/parallel.py:288, in BatchedCalls.__call__(self)
    284 def __call__(self):
    285     # Set the default nested backend to self._backend but do not set the
    286     # change the default number of processes to -1
    287     with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 288         return [func(*args, **kwargs)
    289                 for func, args, kwargs in self.items]

File ~/anaconda3/lib/python3.11/site-packages/joblib/parallel.py:288, in <listcomp>(.0)
    284 def __call__(self):
    285     # Set the default nested backend to self._backend but do not set the
    286     # change the default number of processes to -1
    287     with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 288         return [func(*args, **kwargs)
    289                 for func, args, kwargs in self.items]

File ~/anaconda3/lib/python3.11/site-packages/sklearn/utils/parallel.py:127, in _FuncWrapper.__call__(self, *args, **kwargs)
    125     config = {}
    126 with config_context(**config):
--> 127     return self.function(*args, **kwargs)

File ~/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:754, in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, split_progress, candidate_progress, error_score)
    751 result["fit_error"] = None
    753 fit_time = time.time() - start_time
--> 754 test_scores = _score(estimator, X_test, y_test, scorer, error_score)
    755 score_time = time.time() - start_time - fit_time
    756 if return_train_score:

File ~/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:813, in _score(estimator, X_test, y_test, scorer, error_score)
    811         scores = scorer(estimator, X_test)
    812     else:
--> 813         scores = scorer(estimator, X_test, y_test)
    814 except Exception:
    815     if isinstance(scorer, _MultimetricScorer):
    816         # If `_MultimetricScorer` raises exception, the `error_score`
    817         # parameter is equal to "raise".

File ~/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py:144, in _MultimetricScorer.__call__(self, estimator, *args, **kwargs)
    142 except Exception as e:
    143     if self._raise_exc:
--> 144         raise e
    145     else:
    146         scores[name] = format_exc()

File ~/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py:136, in _MultimetricScorer.__call__(self, estimator, *args, **kwargs)
    134 try:
    135     if isinstance(scorer, _BaseScorer):
--> 136         score = scorer._score(
    137             cached_call, estimator, *args, **routed_params.get(name).score
    138         )
    139     else:
    140         score = scorer(estimator, *args, **routed_params.get(name).score)

File ~/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py:353, in _PredictScorer._score(self, method_caller, estimator, X, y_true, **kwargs)
    316 """Evaluate predicted target values for X relative to y_true.
    317 
    318 Parameters
   (...)
    343     Score function applied to prediction of estimator on X.
    344 """
    345 self._warn_overlap(
    346     message=(
    347         "There is an overlap between set kwargs of this scorer instance and"
   (...)
    351     kwargs=kwargs,
    352 )
--> 353 y_pred = method_caller(estimator, "predict", X)
    354 scoring_kwargs = {**self._kwargs, **kwargs}
    355 return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)

File ~/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py:86, in _cached_call(cache, estimator, response_method, *args, **kwargs)
     83 if cache is not None and response_method in cache:
     84     return cache[response_method]
---> 86 result, _ = _get_response_values(
     87     estimator, *args, response_method=response_method, **kwargs
     88 )
     90 if cache is not None:
     91     cache[response_method] = result

File ~/anaconda3/lib/python3.11/site-packages/sklearn/utils/_response.py:74, in _get_response_values(estimator, X, response_method, pos_label)
     72 if is_classifier(estimator):
     73     prediction_method = _check_response_method(estimator, response_method)
---> 74     classes = estimator.classes_
     75     target_type = "binary" if len(classes) <= 2 else "multiclass"
     77     if pos_label is not None and pos_label not in classes.tolist():

File ~/anaconda3/lib/python3.11/site-packages/sklearn/pipeline.py:758, in Pipeline.classes_(self)
    755 @property
    756 def classes_(self):
    757     """The classes labels. Only exist if the last step is a classifier."""
--> 758     return self.steps[-1][1].classes_

AttributeError: 'PytorchLRClassifier' object has no attribute 'classes_'

Context of the issue

Copied and pasted the code into a Jupyter notebook.

Python 3.11.5 Anaconda

tpot 0.12.1 pytorch 3.11_cpu_0
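
For reference, the exact library versions can be confirmed with a quick check like the one below (a sketch added for completeness; it only relies on the standard __version__ attributes):

# Print the versions that matter here: the classes_ lookup that fails
# lives in scikit-learn, while the estimator comes from TPOT's NN module.
import sklearn
import tpot

print("tpot        :", tpot.__version__)
print("scikit-learn:", sklearn.__version__)

try:
    import torch
    print("torch       :", torch.__version__)
except ImportError:
    print("torch       : not installed")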

Process to reproduce the issue


  1. User creates a TPOT instance.
  2. User calls clf.fit(X_train, y_train) on a TPOTClassifier(config_dict='TPOT NN', template='Selector-Transformer-PytorchLRClassifier', ...) built from the sample code in the documentation.
  3. TPOT crashes with AttributeError: 'PytorchLRClassifier' object has no attribute 'classes_' during the first generation (a minimal check isolating this is sketched below).
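
From the trace, fitting succeeds and the failure happens during scoring: scikit-learn's _get_response_values reads estimator.classes_ for any classifier, and the fitted PytorchLRClassifier never sets that attribute. The following minimal check (assuming PytorchLRClassifier is importable from tpot.builtins and can be constructed with its default hyperparameters, which I have not verified) isolates the problem without running the GP loop:

from sklearn.datasets import make_blobs
from tpot.builtins import PytorchLRClassifier  # assumed import path when torch is installed

X, y = make_blobs(n_samples=100, centers=2, n_features=3, random_state=42)

# Fit the NN estimator directly and check for the attribute the scorer needs.
est = PytorchLRClassifier()
est.fit(X, y)
print("has classes_:", hasattr(est, "classes_"))  # False would reproduce the AttributeError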

Expected result

This example is somewhat trivial, but it should result in nearly 100% classification accuracy.

Current result

Crash

Possible fix

The PytorchLRClassifier interface (or how scikit-learn scores it) has probably changed: newer scikit-learn releases expect every fitted classifier to expose a classes_ attribute, and PytorchLRClassifier does not set one, so the scorer's call into Pipeline.classes_ fails.
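
A possible interim workaround, sketched below and not verified, is a thin subclass that records classes_ in fit(), since that is the attribute scikit-learn scorers expect every fitted classifier to expose (the PatchedPytorchLRClassifier name and the import path are my own assumptions):

import numpy as np
from tpot.builtins import PytorchLRClassifier  # assumed import path when torch is installed

class PatchedPytorchLRClassifier(PytorchLRClassifier):
    """Same estimator, but exposes classes_ after fitting, as sklearn scorers expect."""

    def fit(self, X, y):
        super().fit(X, y)
        # Record the class labels the way standard sklearn classifiers do.
        self.classes_ = np.unique(y)
        return self

Using this inside TPOT would still require registering the patched class in a custom config_dict/template, so fixing it directly in TPOT's NN estimators seems preferable.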

Demo code NN issue screenshot: Screenshot 2024-01-15 at 18 35 56 (image attached to the issue).