Calling the setup function with feature_selection=True in PyCaret 3.3.2 raises the following error:
"ValueError: The feature names should match those that were passed during fit."
Reproducible Example
## At the Anaconda command line, run the following commands
conda create -n pycaret_332_310_2 python=3.10
# Press y when prompted
conda activate pycaret_332_310_2
pip install pycaret[full] ipykernel
python -m ipykernel install --user --name=pycaret_332_310_2
jupyter notebook
## Then open a new notebook, select the pycaret_332_310_2 kernel and run the following code.
from pycaret.datasets import get_data
data = get_data('diabetes')
from pycaret.classification import *
s = setup(data, target = 'Class variable', session_id = 123,
feature_selection=True)
Expected Behavior
I expect setup to complete without error and display the data preview table and the setup summary, which is what happens when I omit the feature_selection=True parameter.
Actual Results
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[2], line 5
2 data = get_data('diabetes')
4 from pycaret.classification import *
----> 5 s = setup(data, target = 'Class variable', session_id = 123
6 , feature_selection=True)
9 # functional API
10 #best = compare_models()
11
(...)
16
17 #plot_model(best,'feature')
File C:\ProgramData\Anaconda3\envs\pycaret_332_310_2\lib\site-packages\pycaret\classification\functional.py:595, in setup(data, data_func, target, index, train_size, test_data, ordinal_features, numeric_features, categorical_features, date_features, text_features, ignore_features, keep_features, preprocess, create_date_columns, imputation_type, numeric_imputation, categorical_imputation, iterative_imputation_iters, numeric_iterative_imputer, categorical_iterative_imputer, text_features_method, max_encoding_ohe, encoding_method, rare_to_value, rare_value, polynomial_features, polynomial_degree, low_variance_threshold, group_features, drop_groups, remove_multicollinearity, multicollinearity_threshold, bin_numeric_features, remove_outliers, outliers_method, outliers_threshold, fix_imbalance, fix_imbalance_method, transformation, transformation_method, normalize, normalize_method, pca, pca_method, pca_components, feature_selection, feature_selection_method, feature_selection_estimator, n_features_to_select, custom_pipeline, custom_pipeline_position, data_split_shuffle, data_split_stratify, fold_strategy, fold, fold_shuffle, fold_groups, n_jobs, use_gpu, html, session_id, system_log, log_experiment, experiment_name, experiment_custom_tags, log_plots, log_profile, log_data, verbose, memory, profile, profile_kwargs)
593 exp = _EXPERIMENT_CLASS()
594 set_current_experiment(exp)
--> 595 return exp.setup(
596 data=data,
597 data_func=data_func,
598 target=target,
599 index=index,
600 train_size=train_size,
601 test_data=test_data,
602 ordinal_features=ordinal_features,
603 numeric_features=numeric_features,
604 categorical_features=categorical_features,
605 date_features=date_features,
606 text_features=text_features,
607 ignore_features=ignore_features,
608 keep_features=keep_features,
609 preprocess=preprocess,
610 create_date_columns=create_date_columns,
611 imputation_type=imputation_type,
612 numeric_imputation=numeric_imputation,
613 categorical_imputation=categorical_imputation,
614 iterative_imputation_iters=iterative_imputation_iters,
615 numeric_iterative_imputer=numeric_iterative_imputer,
616 categorical_iterative_imputer=categorical_iterative_imputer,
617 text_features_method=text_features_method,
618 max_encoding_ohe=max_encoding_ohe,
619 encoding_method=encoding_method,
620 rare_to_value=rare_to_value,
621 rare_value=rare_value,
622 polynomial_features=polynomial_features,
623 polynomial_degree=polynomial_degree,
624 low_variance_threshold=low_variance_threshold,
625 group_features=group_features,
626 drop_groups=drop_groups,
627 remove_multicollinearity=remove_multicollinearity,
628 multicollinearity_threshold=multicollinearity_threshold,
629 bin_numeric_features=bin_numeric_features,
630 remove_outliers=remove_outliers,
631 outliers_method=outliers_method,
632 outliers_threshold=outliers_threshold,
633 fix_imbalance=fix_imbalance,
634 fix_imbalance_method=fix_imbalance_method,
635 transformation=transformation,
636 transformation_method=transformation_method,
637 normalize=normalize,
638 normalize_method=normalize_method,
639 pca=pca,
640 pca_method=pca_method,
641 pca_components=pca_components,
642 feature_selection=feature_selection,
643 feature_selection_method=feature_selection_method,
644 feature_selection_estimator=feature_selection_estimator,
645 n_features_to_select=n_features_to_select,
646 custom_pipeline=custom_pipeline,
647 custom_pipeline_position=custom_pipeline_position,
648 data_split_shuffle=data_split_shuffle,
649 data_split_stratify=data_split_stratify,
650 fold_strategy=fold_strategy,
651 fold=fold,
652 fold_shuffle=fold_shuffle,
653 fold_groups=fold_groups,
654 n_jobs=n_jobs,
655 use_gpu=use_gpu,
656 html=html,
657 session_id=session_id,
658 system_log=system_log,
659 log_experiment=log_experiment,
660 experiment_name=experiment_name,
661 experiment_custom_tags=experiment_custom_tags,
662 log_plots=log_plots,
663 log_profile=log_profile,
664 log_data=log_data,
665 verbose=verbose,
666 memory=memory,
667 profile=profile,
668 profile_kwargs=profile_kwargs,
669 )
File C:\ProgramData\Anaconda3\envs\pycaret_332_310_2\lib\site-packages\pycaret\classification\oop.py:890, in ClassificationExperiment.setup(self, data, data_func, target, index, train_size, test_data, ordinal_features, numeric_features, categorical_features, date_features, text_features, ignore_features, keep_features, preprocess, create_date_columns, imputation_type, numeric_imputation, categorical_imputation, iterative_imputation_iters, numeric_iterative_imputer, categorical_iterative_imputer, text_features_method, max_encoding_ohe, encoding_method, rare_to_value, rare_value, polynomial_features, polynomial_degree, low_variance_threshold, group_features, drop_groups, remove_multicollinearity, multicollinearity_threshold, bin_numeric_features, remove_outliers, outliers_method, outliers_threshold, fix_imbalance, fix_imbalance_method, transformation, transformation_method, normalize, normalize_method, pca, pca_method, pca_components, feature_selection, feature_selection_method, feature_selection_estimator, n_features_to_select, custom_pipeline, custom_pipeline_position, data_split_shuffle, data_split_stratify, fold_strategy, fold, fold_shuffle, fold_groups, n_jobs, use_gpu, html, session_id, system_log, log_experiment, experiment_name, experiment_custom_tags, log_plots, log_profile, log_data, engine, verbose, memory, profile, profile_kwargs)
887 if ("placeholder", None) in self.pipeline.steps and len(self.pipeline) > 1:
888 self.pipeline.steps.remove(("placeholder", None))
--> 890 self.pipeline.fit(self.X_train, self.y_train)
892 self.logger.info("Finished creating preprocessing pipeline.")
893 self.logger.info(f"Pipeline: {self.pipeline}")
File C:\ProgramData\Anaconda3\envs\pycaret_332_310_2\lib\site-packages\pycaret\internal\pipeline.py:273, in Pipeline.fit(self, X, y, **params)
271 def fit(self, X=None, y=None, **params):
272 routed_params = self._check_method_params(method="fit", props=params)
--> 273 X, y, _ = self._fit(X, y, routed_params)
275 with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)):
276 if self._final_estimator != "passthrough":
File C:\ProgramData\Anaconda3\envs\pycaret_332_310_2\lib\site-packages\pycaret\internal\pipeline.py:256, in Pipeline._fit(self, X, y, routed_params)
248 # Fit or load the current transformer from cache
249 fitted_transformer = self._memory_fit(
250 transformer=cloned,
251 X=X,
(...)
254 params=routed_params.get(name, {}),
255 )
--> 256 X, y = self._memory_transform(
257 transformer=fitted_transformer,
258 X=X,
259 y=y,
260 )
262 # Replace the transformer of the step with the fitted
263 # transformer (necessary when loading from the cache)
264 self.steps[step_idx] = (name, fitted_transformer)
File C:\ProgramData\Anaconda3\envs\pycaret_332_310_2\lib\site-packages\joblib\memory.py:655, in MemorizedFunc.__call__(self, *args, **kwargs)
654 def __call__(self, *args, **kwargs):
--> 655 return self._cached_call(args, kwargs)[0]
File C:\ProgramData\Anaconda3\envs\pycaret_332_310_2\lib\site-packages\pycaret\internal\memory.py:392, in FastMemorizedFunc._cached_call(self, args, kwargs, shelving)
389 if must_call:
390 # PYCARET CHANGES
391 self._cached_output_identifiers = func_id, args_id
--> 392 out, metadata = self.call(*args, **kwargs)
393 if self.mmap_mode is not None and metadata is not None:
394 # PYCARET CHANGES END
395 # Memmap the output at the first call to be consistent with
396 # later calls
397 out = self.store_backend.load_item(
398 [func_id, args_id], msg=msg, verbose=self._verbose
399 )
File C:\ProgramData\Anaconda3\envs\pycaret_332_310_2\lib\site-packages\pycaret\internal\memory.py:308, in FastMemorizedFunc.call(self, *args, **kwargs)
306 # PYCARET CHANGES
307 func_start_time = time.monotonic()
--> 308 output = self.func(*args, **kwargs)
309 func_duration = time.monotonic() - func_start_time
310 if func_duration >= self.min_time_to_cache:
File C:\ProgramData\Anaconda3\envs\pycaret_332_310_2\lib\site-packages\pycaret\internal\pipeline.py:80, in _transform_one(transformer, X, y)
78 if "y" in signature(transformer.transform).parameters:
79 args.append(y)
---> 80 output = transformer.transform(*args)
82 if isinstance(output, tuple):
83 X, y = output[0], output[1]
File C:\ProgramData\Anaconda3\envs\pycaret_332_310_2\lib\site-packages\sklearn\utils\_set_output.py:295, in _wrap_method_output.<locals>.wrapped(self, X, *args, **kwargs)
293 @wraps(f)
294 def wrapped(self, X, *args, **kwargs):
--> 295 data_to_wrap = f(self, X, *args, **kwargs)
296 if isinstance(data_to_wrap, tuple):
297 # only wrap the first output for cross decomposition
298 return_tuple = (
299 _wrap_data_with_container(method, data_to_wrap[0], X, self),
300 *data_to_wrap[1:],
301 )
File C:\ProgramData\Anaconda3\envs\pycaret_332_310_2\lib\site-packages\pycaret\internal\preprocess\transformers.py:255, in TransformerWrapper.transform(self, X, y)
252 elif "X" not in transform_params:
253 return X, y
--> 255 output = self.transformer.transform(*args)
257 # Transform can return X, y or both
258 if isinstance(output, tuple):
File C:\ProgramData\Anaconda3\envs\pycaret_332_310_2\lib\site-packages\sklearn\utils\_set_output.py:295, in _wrap_method_output.<locals>.wrapped(self, X, *args, **kwargs)
293 @wraps(f)
294 def wrapped(self, X, *args, **kwargs):
--> 295 data_to_wrap = f(self, X, *args, **kwargs)
296 if isinstance(data_to_wrap, tuple):
297 # only wrap the first output for cross decomposition
298 return_tuple = (
299 _wrap_data_with_container(method, data_to_wrap[0], X, self),
300 *data_to_wrap[1:],
301 )
File C:\ProgramData\Anaconda3\envs\pycaret_332_310_2\lib\site-packages\sklearn\feature_selection\_base.py:107, in SelectorMixin.transform(self, X)
103 preserve_X = output_config_dense != "default" and _is_pandas_df(X)
105 # note: we use _safe_tags instead of _get_tags because this is a
106 # public Mixin.
--> 107 X = self._validate_data(
108 X,
109 dtype=None,
110 accept_sparse="csr",
111 force_all_finite=not _safe_tags(self, key="allow_nan"),
112 cast_to_ndarray=not preserve_X,
113 reset=False,
114 )
115 return self._transform(X)
File C:\ProgramData\Anaconda3\envs\pycaret_332_310_2\lib\site-packages\sklearn\base.py:608, in BaseEstimator._validate_data(self, X, y, reset, validate_separately, cast_to_ndarray, **check_params)
537 def _validate_data(
538 self,
539 X="no_validation",
(...)
544 **check_params,
545 ):
546 """Validate input data and set or check the `n_features_in_` attribute.
547
548 Parameters
(...)
606 validated.
607 """
--> 608 self._check_feature_names(X, reset=reset)
610 if y is None and self._get_tags()["requires_y"]:
611 raise ValueError(
612 f"This {self.__class__.__name__} estimator "
613 "requires y to be passed, but the target y is None."
614 )
File C:\ProgramData\Anaconda3\envs\pycaret_332_310_2\lib\site-packages\sklearn\base.py:535, in BaseEstimator._check_feature_names(self, X, reset)
530 if not missing_names and not unexpected_names:
531 message += (
532 "Feature names must be in the same order as they were in fit.\n"
533 )
--> 535 raise ValueError(message)
ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- 2-Hour serum insulin (mu U/ml)
- Age (years)
- Body mass index (weight in kg/(height in m)^2)
- Diabetes pedigree function
- Diastolic blood pressure (mm Hg)
- ...
Feature names seen at fit time, yet now missing:
- 2-Hour_serum_insulin_(mu_U/ml)
- Age_(years)
- Body_mass_index_(weight_in_kg/(height_in_m)^2)
- Diabetes_pedigree_function
- Diastolic_blood_pressure_(mm_Hg)
- ...
pycaret version checks
[X] I have checked that this issue has not already been reported here.
[X] I have confirmed this bug exists on the latest version of pycaret.
[x] I have confirmed this bug exists on the master branch of pycaret (pip install -U git+https://github.com/pycaret/pycaret.git@master).
Issue Description
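Calling the setup function with feature_selection=True in PyCaret 3.3.2 raises "ValueError: The feature names should match those that were passed during fit." According to the error message, the feature selection transformer was fitted on column names in which spaces were replaced with underscores (e.g. "Age_(years)"), but at transform time it receives the original column names containing spaces (e.g. "Age (years)"), so the feature-name check fails.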
Reproducible Example
Expected Behavior
I expect the code to return a data table and a successful setup screen, which is what I get when I omit the feature_selection=True parameter.
Actual Results
Installed Versions