pycaret / pycaret

An open-source, low-code machine learning library in Python
https://www.pycaret.org
MIT License
8.68k stars 1.74k forks source link

[BUG]: fix_imbalance_method does not work for SMOTENC #3947

Open chris-cutting opened 3 months ago

chris-cutting commented 3 months ago

pycaret version checks

Issue Description

To use SMOTENC to fix class imbalance, one has to provide the list of categorical features. However, PyCaret does not seem to understand this when it is passed in `fix_imbalance_method`:

Invalid value for the strategy parameter, got smotenc(categorical_features=[1,3,4,5,6,9]). Choose from: condensednearestneighbour, editednearestneighborus, repeatededitednearestneighbours, allknn, instancehardnessthreshold, nearmiss, neighbourhoodcleaningrule, onesidedselection, randomundersampler, tomeklinks, randomoversampler, smote, smotenc, smoten, adasyn, borderlinesmote, kmeanssmote, svmsmote, smoteenn, smotetomek.

Reproducible Example

from pycaret.datasets import get_data
from pycaret.classification import *
df=get_data('titanic')
s = setup(df, target = 'Survived', session_id = 123, ignore_features=['PassengerId','Name','Ticket','Cabin'], fix_imbalance=True, fix_imbalance_method='SMOTENC(categorical_features=[2,4,11])')

Expected Behavior

I would expect SMOTENC to be applied to the data so that additional points are generated for the minority class.

Actual Results

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[173], line 3
      1 from pycaret.datasets import get_data
      2 df=get_data('titanic')
----> 3 s = setup(df, target = 'Survived', session_id = 123,ignore_features=['PassengerId','Name','Ticket','Cabin'],fix_imbalance=True,fix_imbalance_method='SMOTENC(categorical_features=[2,4,11])')

File c:\Users\ChristineCutting\AppData\Local\Programs\Python\Python310\lib\site-packages\pycaret\classification\functional.py:595, in setup(data, data_func, target, index, train_size, test_data, ordinal_features, numeric_features, categorical_features, date_features, text_features, ignore_features, keep_features, preprocess, create_date_columns, imputation_type, numeric_imputation, categorical_imputation, iterative_imputation_iters, numeric_iterative_imputer, categorical_iterative_imputer, text_features_method, max_encoding_ohe, encoding_method, rare_to_value, rare_value, polynomial_features, polynomial_degree, low_variance_threshold, group_features, drop_groups, remove_multicollinearity, multicollinearity_threshold, bin_numeric_features, remove_outliers, outliers_method, outliers_threshold, fix_imbalance, fix_imbalance_method, transformation, transformation_method, normalize, normalize_method, pca, pca_method, pca_components, feature_selection, feature_selection_method, feature_selection_estimator, n_features_to_select, custom_pipeline, custom_pipeline_position, data_split_shuffle, data_split_stratify, fold_strategy, fold, fold_shuffle, fold_groups, n_jobs, use_gpu, html, session_id, system_log, log_experiment, experiment_name, experiment_custom_tags, log_plots, log_profile, log_data, verbose, memory, profile, profile_kwargs)
    593 exp = _EXPERIMENT_CLASS()
    594 set_current_experiment(exp)
--> 595 return exp.setup(
    596     data=data,
    597     data_func=data_func,
    598     target=target,
    599     index=index,
    600     train_size=train_size,
    601     test_data=test_data,
    602     ordinal_features=ordinal_features,
    603     numeric_features=numeric_features,
    604     categorical_features=categorical_features,
    605     date_features=date_features,
    606     text_features=text_features,
    607     ignore_features=ignore_features,
    608     keep_features=keep_features,
    609     preprocess=preprocess,
    610     create_date_columns=create_date_columns,
    611     imputation_type=imputation_type,
    612     numeric_imputation=numeric_imputation,
    613     categorical_imputation=categorical_imputation,
    614     iterative_imputation_iters=iterative_imputation_iters,
    615     numeric_iterative_imputer=numeric_iterative_imputer,
    616     categorical_iterative_imputer=categorical_iterative_imputer,
    617     text_features_method=text_features_method,
    618     max_encoding_ohe=max_encoding_ohe,
    619     encoding_method=encoding_method,
    620     rare_to_value=rare_to_value,
    621     rare_value=rare_value,
    622     polynomial_features=polynomial_features,
    623     polynomial_degree=polynomial_degree,
    624     low_variance_threshold=low_variance_threshold,
    625     group_features=group_features,
    626     drop_groups=drop_groups,
    627     remove_multicollinearity=remove_multicollinearity,
    628     multicollinearity_threshold=multicollinearity_threshold,
    629     bin_numeric_features=bin_numeric_features,
    630     remove_outliers=remove_outliers,
    631     outliers_method=outliers_method,
    632     outliers_threshold=outliers_threshold,
    633     fix_imbalance=fix_imbalance,
    634     fix_imbalance_method=fix_imbalance_method,
    635     transformation=transformation,
    636     transformation_method=transformation_method,
    637     normalize=normalize,
    638     normalize_method=normalize_method,
    639     pca=pca,
    640     pca_method=pca_method,
    641     pca_components=pca_components,
    642     feature_selection=feature_selection,
    643     feature_selection_method=feature_selection_method,
    644     feature_selection_estimator=feature_selection_estimator,
    645     n_features_to_select=n_features_to_select,
    646     custom_pipeline=custom_pipeline,
    647     custom_pipeline_position=custom_pipeline_position,
    648     data_split_shuffle=data_split_shuffle,
    649     data_split_stratify=data_split_stratify,
    650     fold_strategy=fold_strategy,
    651     fold=fold,
    652     fold_shuffle=fold_shuffle,
    653     fold_groups=fold_groups,
    654     n_jobs=n_jobs,
    655     use_gpu=use_gpu,
    656     html=html,
    657     session_id=session_id,
    658     system_log=system_log,
    659     log_experiment=log_experiment,
    660     experiment_name=experiment_name,
    661     experiment_custom_tags=experiment_custom_tags,
    662     log_plots=log_plots,
    663     log_profile=log_profile,
    664     log_data=log_data,
    665     verbose=verbose,
    666     memory=memory,
    667     profile=profile,
    668     profile_kwargs=profile_kwargs,
    669 )

File c:\Users\ChristineCutting\AppData\Local\Programs\Python\Python310\lib\site-packages\pycaret\classification\oop.py:854, in ClassificationExperiment.setup(self, data, data_func, target, index, train_size, test_data, ordinal_features, numeric_features, categorical_features, date_features, text_features, ignore_features, keep_features, preprocess, create_date_columns, imputation_type, numeric_imputation, categorical_imputation, iterative_imputation_iters, numeric_iterative_imputer, categorical_iterative_imputer, text_features_method, max_encoding_ohe, encoding_method, rare_to_value, rare_value, polynomial_features, polynomial_degree, low_variance_threshold, group_features, drop_groups, remove_multicollinearity, multicollinearity_threshold, bin_numeric_features, remove_outliers, outliers_method, outliers_threshold, fix_imbalance, fix_imbalance_method, transformation, transformation_method, normalize, normalize_method, pca, pca_method, pca_components, feature_selection, feature_selection_method, feature_selection_estimator, n_features_to_select, custom_pipeline, custom_pipeline_position, data_split_shuffle, data_split_stratify, fold_strategy, fold, fold_shuffle, fold_groups, n_jobs, use_gpu, html, session_id, system_log, log_experiment, experiment_name, experiment_custom_tags, log_plots, log_profile, log_data, engine, verbose, memory, profile, profile_kwargs)
    852 # Balance the classes in the target column
    853 if fix_imbalance:
--> 854     self._balance(fix_imbalance_method, session_id)
    856 # Power transform the data to be more Gaussian-like
    857 if transformation:

File c:\Users\ChristineCutting\AppData\Local\Programs\Python\Python310\lib\site-packages\pycaret\internal\preprocess\preprocessor.py:856, in Preprocessor._balance(self, fix_imbalance_method, session_id)
    854 fix_imbalance_method = fix_imbalance_method.lower()
    855 if fix_imbalance_method not in strategies:
--> 856     raise ValueError(
    857         "Invalid value for the strategy parameter, got "
    858         f"{fix_imbalance_method}. Choose from: {', '.join(strategies)}."
    859     )
    860 try:
    861     balance_estimator = FixImbalancer(
    862         strategies[fix_imbalance_method](random_state=session_id)
    863     )

ValueError: Invalid value for the strategy parameter, got smotenc(categorical_features=[2,4,11]). Choose from: condensednearestneighbour, editednearestneighborus, repeatededitednearestneighbours, allknn, instancehardnessthreshold, nearmiss, neighbourhoodcleaningrule, onesidedselection, randomundersampler, tomeklinks, randomoversampler, smote, smotenc, smoten, adasyn, borderlinesmote, kmeanssmote, svmsmote, smoteenn, smotetomek.

Installed Versions

System: python: 3.10.0 (tags/v3.10.0:b494f59, Oct 4 2021, 19:00:18) [MSC v.1929 64 bit (AMD64)] executable: c:\Users\ChristineCutting\AppData\Local\Programs\Python\Python310\python.exe machine: Windows-10-10.0.22631-SP0

PyCaret required dependencies: pip: 24.0 setuptools: 57.4.0 pycaret: 3.2.0 IPython: 8.21.0 ipywidgets: 8.1.2 tqdm: 4.66.2 numpy: 1.25.2 pandas: 1.5.3 jinja2: 3.1.3 scipy: 1.10.1 joblib: 1.3.2 sklearn: 1.2.2 pyod: 1.1.3 imblearn: 0.12.0 category_encoders: 2.6.3 lightgbm: 4.3.0 numba: 0.58.1 requests: 2.31.0 matplotlib: 3.6.0 scikitplot: 0.3.7 yellowbrick: 1.5 plotly: 5.19.0 plotly-resampler: Not installed kaleido: 0.2.1 schemdraw: 0.15 statsmodels: 0.14.1 sktime: 0.21.1 tbats: 1.1.3 pmdarima: 2.0.4 psutil: 5.9.8 markupsafe: 2.1.5 pickle5: Not installed cloudpickle: 3.0.0 deprecation: 2.1.0 xxhash: 3.4.1 wurlitzer: Not installed

PyCaret optional dependencies: shap: 0.44.1 interpret: 0.5.1 umap: 0.5.5 ydata_profiling: 4.6.4 explainerdashboard: 0.4.5 autoviz: Not installed fairlearn: 0.7.0 deepchecks: Not installed xgboost: Not installed catboost: Not installed kmodes: Not installed mlxtend: Not installed statsforecast: Not installed tune_sklearn: Not installed ray: Not installed hyperopt: Not installed optuna: Not installed skopt: Not installed mlflow: Not installed gradio: Not installed fastapi: Not installed uvicorn: Not installed m2cgen: Not installed evidently: Not installed fugue: Not installed streamlit: Not installed prophet: Not installed

mariamonzon commented 2 weeks ago

I also have the same problem:

File ~/Projects/piord-data-analysis/venv-eda/lib/python3.10/site-packages/pycaret/classification/oop.py:854, in ClassificationExperiment.setup(self, data, data_func, target, index, train_size, test_data, ordinal_features, numeric_features, categorical_features, date_features, text_features, ignore_features, keep_features, preprocess, create_date_columns, imputation_type, numeric_imputation, categorical_imputation, iterative_imputation_iters, numeric_iterative_imputer, categorical_iterative_imputer, text_features_method, max_encoding_ohe, encoding_method, rare_to_value, rare_value, polynomial_features, polynomial_degree, low_variance_threshold, group_features, drop_groups, remove_multicollinearity, multicollinearity_threshold, bin_numeric_features, remove_outliers, outliers_method, outliers_threshold, fix_imbalance, fix_imbalance_method, transformation, transformation_method, normalize, normalize_method, pca, pca_method, pca_components, feature_selection, feature_selection_method, feature_selection_estimator, n_features_to_select, custom_pipeline, custom_pipeline_position, data_split_shuffle, data_split_stratify, fold_strategy, fold, fold_shuffle, fold_groups, n_jobs, use_gpu, html, session_id, system_log, log_experiment, experiment_name, experiment_custom_tags, log_plots, log_profile, log_data, engine, verbose, memory, profile, profile_kwargs) 852 # Balance the classes in the target column 853 if fix_imbalance: --> 854 self._balance(fix_imbalance_method, session_id) 856 # Power transform the data to be more Gaussian-like 857 if transformation:

File ~/Projects/piord-data-analysis/venv-eda/lib/python3.10/site-packages/pycaret/internal/preprocess/preprocessor.py:867, in Preprocessor._balance(self, fix_imbalance_method, session_id) 863 balance_estimator = FixImbalancer( 864 strategies[fix_imbalance_method](random_state=session_id) 865 ) 866 except TypeError: --> 867 balance_estimator = FixImbalancer(strategies[fix_imbalance_method]()) 868 elif not hasattr(fix_imbalance_method, "fit_resample"): 869 raise TypeError( 870 "Invalid value for the fix_imbalance_method parameter. " 871 "The provided value must be a imblearn estimator, got " 872 f"{fix_imbalance_method.__class__.__name__}." 873 )

TypeError: SMOTENC.__init__() missing 1 required positional argument: 'categorical_features'