pycaret / pycaret

An open-source, low-code machine learning library in Python
https://www.pycaret.org
MIT License
8.83k stars 1.76k forks source link

[BUG]: groupkfold issue: Learning Curve plot in evaluate_model() returns error #3073

Closed Ali-Flt closed 1 year ago

Ali-Flt commented 1 year ago

pycaret version checks

Issue Description

After using the GroupKFold strategy and training a model, trying to evaluate it by viewing the Learning Curve plot returns an error. Other plots may also be affected and should be checked. (This was checked after the latest PR, #3057.)

Reproducible Example

from pycaret.regression import *
s = setup(
          train_data,
          target=label,
          test_data=test_data,
          imputation_type=None,
          normalize=True,
          index=True,
          html=True,
          profile=False,
          verbose=True,
          session_id=(3964),
          feature_selection=False,
          feature_selection_method='sequential',
          n_features_to_select=20,
          polynomial_features=False,
          polynomial_degree=1,
          remove_multicollinearity=True,
          fold_shuffle=True,
          fold_strategy='groupkfold',
          fold_groups=train_data['groups'],
          fold=10,
          remove_outliers=False,
          transformation=False,
          ignore_features=['groups'],
          pca=True,
          pca_method='kernel',
          pca_components=30
          )

groups=train_data['groups']
search_library='optuna'
optimize='MAE'
tuner_verbose=0
bests = compare_models(turbo=False, n_select=4, sort=optimize, groups=groups)
leader_board = get_leaderboard()
best_model = automl(optimize=optimize, use_holdout=True)
evaluate_model(best_model, groups=groups)

Expected Behavior

The Learning Curve plot is shown without errors.

Actual Results

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
File /opt/miniconda3/envs/pycaret_env_dev/lib/python3.8/site-packages/ipywidgets/widgets/interaction.py:239, in interactive.update(self, *args)
    237     value = widget.get_interact_value()
    238     self.kwargs[widget._kwarg] = value
--> 239 self.result = self.f(**self.kwargs)
    240 show_inline_matplotlib_plots()
    241 if self.auto_display and self.result is not None:

File /opt/miniconda3/envs/pycaret_env_dev/lib/python3.8/site-packages/pycaret/internal/pycaret_experiment/tabular_experiment.py:1960, in _TabularExperiment._plot_model(self, estimator, plot, scale, save, fold, fit_kwargs, plot_kwargs, groups, feature_name, label, use_train_data, verbose, system, display, display_format)
   1958 # execute the plot method
   1959 with redirect_output(self.logger):
-> 1960     ret = locals()[plot]()
   1961 if ret:
   1962     plot_filename = ret

File /opt/miniconda3/envs/pycaret_env_dev/lib/python3.8/site-packages/pycaret/internal/pycaret_experiment/tabular_experiment.py:1379, in _TabularExperiment._plot_model.<locals>.learning()
   1371 sizes = np.linspace(0.3, 1.0, 10)
   1372 visualizer = LearningCurve(
   1373     estimator,
   1374     cv=cv,
   (...)
   1377     random_state=self.seed,
   1378 )
-> 1379 return show_yellowbrick_plot(
   1380     visualizer=visualizer,
   1381     X_train=self.X_train_transformed,
   1382     y_train=self.y_train_transformed,
   1383     X_test=self.X_test_transformed,
   1384     y_test=self.y_test_transformed,
   1385     handle_test="",
   1386     name=plot_name,
   1387     scale=scale,
   1388     save=save,
   1389     fit_kwargs=fit_kwargs,
   1390     groups=groups,
   1391     display_format=display_format,
   1392 )

File /opt/miniconda3/envs/pycaret_env_dev/lib/python3.8/site-packages/pycaret/internal/plots/yellowbrick.py:88, in show_yellowbrick_plot(visualizer, X_train, y_train, X_test, y_test, name, handle_train, handle_test, scale, save, fit_kwargs, groups, display_format, **kwargs)
     86 elif handle_train == "fit":
     87     logger.info("Fitting Model")
---> 88     visualizer.fit(X_train, y_train, **fit_kwargs_and_kwargs)
     89 elif handle_train == "fit_transform":
     90     logger.info("Fitting & Transforming Model")

File /opt/miniconda3/envs/pycaret_env_dev/lib/python3.8/site-packages/yellowbrick/model_selection/learning_curve.py:249, in LearningCurve.fit(self, X, y)
    233 sklc_kwargs = {
    234     key: self.get_params()[key]
    235     for key in (
   (...)
    245     )
    246 }
    248 # compute the learning curve and store the scores on the estimator
--> 249 curve = sk_learning_curve(self.estimator, X, y, **sklc_kwargs)
    250 self.train_sizes_, self.train_scores_, self.test_scores_ = curve
    252 # compute the mean and standard deviation of the training data

File /opt/miniconda3/envs/pycaret_env_dev/lib/python3.8/site-packages/sklearn/model_selection/_validation.py:1513, in learning_curve(estimator, X, y, groups, train_sizes, cv, scoring, exploit_incremental_learning, n_jobs, pre_dispatch, verbose, shuffle, random_state, error_score, return_times, fit_params)
   1511 cv = check_cv(cv, y, classifier=is_classifier(estimator))
   1512 # Store it as list as we will be iterating over the list multiple times
-> 1513 cv_iter = list(cv.split(X, y, groups))
   1515 scorer = check_scoring(estimator, scoring=scoring)
   1517 n_max_training_samples = len(cv_iter[0][0])

File /opt/miniconda3/envs/pycaret_env_dev/lib/python3.8/site-packages/sklearn/model_selection/_split.py:340, in _BaseKFold.split(self, X, y, groups)
    332 if self.n_splits > n_samples:
    333     raise ValueError(
    334         (
    335             "Cannot have number of splits n_splits={0} greater"
    336             " than the number of samples: n_samples={1}."
    337         ).format(self.n_splits, n_samples)
    338     )
--> 340 for train, test in super().split(X, y, groups):
    341     yield train, test

File /opt/miniconda3/envs/pycaret_env_dev/lib/python3.8/site-packages/sklearn/model_selection/_split.py:86, in BaseCrossValidator.split(self, X, y, groups)
     84 X, y, groups = indexable(X, y, groups)
     85 indices = np.arange(_num_samples(X))
---> 86 for test_index in self._iter_test_masks(X, y, groups):
     87     train_index = indices[np.logical_not(test_index)]
     88     test_index = indices[test_index]

File /opt/miniconda3/envs/pycaret_env_dev/lib/python3.8/site-packages/sklearn/model_selection/_split.py:98, in BaseCrossValidator._iter_test_masks(self, X, y, groups)
     93 def _iter_test_masks(self, X=None, y=None, groups=None):
     94     """Generates boolean masks corresponding to test sets.
     95 
     96     By default, delegates to _iter_test_indices(X, y, groups)
     97     """
---> 98     for test_index in self._iter_test_indices(X, y, groups):
     99         test_mask = np.zeros(_num_samples(X), dtype=bool)
    100         test_mask[test_index] = True

File /opt/miniconda3/envs/pycaret_env_dev/lib/python3.8/site-packages/sklearn/model_selection/_split.py:518, in GroupKFold._iter_test_indices(self, X, y, groups)
    516 def _iter_test_indices(self, X, y, groups):
    517     if groups is None:
--> 518         raise ValueError("The 'groups' parameter should not be None.")
    519     groups = check_array(groups, input_name="groups", ensure_2d=False, dtype=None)
    521     unique_groups, groups = np.unique(groups, return_inverse=True)

ValueError: The 'groups' parameter should not be None.

Installed Versions

System: python: 3.8.13 | packaged by conda-forge | (default, Mar 25 2022, 06:04:18) [GCC 10.3.0] executable: /opt/miniconda3/envs/pycaret_env_dev/bin/python3.8 machine: Linux-6.0.2-arch1-1-x86_64-with-glibc2.10 PyCaret required dependencies: pip: 22.3.1 setuptools: 65.5.1 pycaret: 3.0.0rc4 IPython: 8.6.0 ipywidgets: 8.0.2 tqdm: 4.64.1 numpy: 1.22.4 pandas: 1.4.4 jinja2: 3.1.2 scipy: 1.8.1 joblib: 1.2.0 sklearn: 1.1.3 pyod: 1.0.6 imblearn: 0.9.1 category_encoders: 2.5.1.post0 lightgbm: 3.3.3 numba: 0.55.2 requests: 2.28.1 matplotlib: 3.6.2 scikitplot: 0.3.7 yellowbrick: 1.5 plotly: 5.11.0 kaleido: 0.2.1 statsmodels: 0.13.5 sktime: 0.13.2 tbats: 1.1.1 pmdarima: 1.8.5 psutil: 5.9.3 PyCaret optional dependencies: shap: Not installed interpret: Not installed umap: Not installed pandas_profiling: Not installed explainerdashboard: Not installed autoviz: Not installed fairlearn: Not installed xgboost: Not installed catboost: Not installed kmodes: Not installed mlxtend: Not installed statsforecast: Not installed tune_sklearn: 0.4.4 ray: 2.0.1 hyperopt: 0.2.7 optuna: 3.0.3 skopt: 0.9.0 mlflow: 1.30.0 gradio: 3.9 fastapi: 0.86.0 uvicorn: 0.19.0 m2cgen: 0.10.0 evidently: 0.1.59.dev3 nltk: Not installed pyLDAvis: Not installed gensim: Not installed spacy: Not installed wordcloud: Not installed textblob: Not installed fugue: Not installed streamlit: Not installed prophet: Not installed
moezali1 commented 1 year ago

@Ali-Flt Can you please upload a reproducible notebook along with the sample data?

moezali1 commented 1 year ago

@tvdboom

tvdboom commented 1 year ago

Thanks for pointing out this issue. I fixed it in #3101