rasbt / mlxtend

A library of extension and helper modules for Python's data analysis and machine learning libraries.
https://rasbt.github.io/mlxtend/

How to use categorical data without one hot encoding in SequentialFeatureSelector, I am receiving an error #1093

Open ago302 opened 2 months ago

ago302 commented 2 months ago

Hi, this is my code, but I am receiving an error. Could you please help me with it?

```python
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool, EShapCalcType, EFeaturesSelectionAlgorithm
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

# Generating numerical features
X, y = make_regression(n_samples=100, n_features=9, n_informative=2, noise=0.1, random_state=0)

# Generating categorical features
X_categorical = np.random.choice(['pooh', 'rabbit', 'piglet', 'Christopher'], size=(100, 2))

# Combine into a DataFrame
X_combined = np.hstack((X, X_categorical))
feature_names = ['F{}'.format(i) for i in range(X_combined.shape[1])]
df = pd.DataFrame(X_combined, columns=feature_names)
df[['F9', 'F10']] = df[['F9', 'F10']].astype("category")
df.loc[:, ~df.columns.isin(['F9', 'F10'])] = df.loc[:, ~df.columns.isin(['F9', 'F10'])].astype("float")
num_col = df.columns.drop(['F9', 'F10'])
for column in num_col:
    df[column] = pd.to_numeric(df[column], errors='coerce')

X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.25, random_state=0)

# Fitting XGBoost directly on the DataFrame works
import xgboost as xgb
clf = xgb.XGBRegressor(enable_categorical=True)
clf.fit(X_train, y_train)

# But the same estimator inside SequentialFeatureSelector fails
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
import xgboost as xgb

clf = xgb.XGBRegressor(enable_categorical=True)
sfs1 = SFS(clf,
           k_features="best",
           forward=False,
           floating=False,
           verbose=2,
           scoring="neg_mean_absolute_error",
           clone_estimator=False,
           n_jobs=1,
           cv=0,
           )

sfs1 = sfs1.fit(X_train, y_train)
```

And this is the error:


```
ValueError                                Traceback (most recent call last)
Cell In[47], line 13
      3 clf = xgb.XGBRegressor(enable_categorical=True)
      4 sfs1 = SFS(clf,
      5            k_features="best",
      6            forward=False,
   (...)
     10            n_jobs=-1,
     11            cv=5)
---> 13 sfs1 = sfs1.fit(X_train, y_train)

File c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\mlxtend\feature_selection\sequential_feature_selector.py:518, in SequentialFeatureSelector.fit(self, X, y, groups, **fit_params)
    516 k = len(k_idx)
    517 if k > 0:
--> 518     k_idx, k_score = _calc_score(
    519         self,
    520         X_,
    521         y,
    522         k_idx,
    523         groups=groups,
    524         feature_groups=self.feature_groups,
    525         **fit_params,
    526     )
    527     self.subsets_[k] = {
    528         "feature_idx": k_idx,
    529         "cv_scores": k_score,
    530         "avg_score": np.nanmean(k_score),
    531     }
    533 orig_set = set(range(self.k_ub))

File c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\mlxtend\feature_selection\utilities.py:98, in _calc_score(selector, X, y, indices, groups, feature_groups, **fit_params)
     96 IDX = _merge_lists(feature_groups, indices)
     97 if selector.cv:
---> 98     scores = cross_val_score(
     99         selector.est_,
    100         X[:, IDX],
    101         y,
    102         groups=groups,
    103         cv=selector.cv,
    104         scoring=selector.scorer,
    105         n_jobs=1,
    106         pre_dispatch=selector.pre_dispatch,
    107         fit_params=fit_params,
    108     )
    109 else:
    110     selector.est_.fit(X[:, IDX], y, **fit_params)

File c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\_param_validation.py:213, in validate_params.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
    207 try:
    208     with config_context(
    209         skip_parameter_validation=(
    210             prefer_skip_nested_validation or global_skip_validation
    211         )
    212     ):
--> 213         return func(*args, **kwargs)
    214 except InvalidParameterError as e:
    215     # When the function is just a wrapper around an estimator, we allow
    216     # the function to delegate validation to the estimator, but we replace
    217     # the name of the estimator by the name of the function in the error
    218     # message to avoid confusion.
    219     msg = re.sub(
    220         r"parameter of \w+ must be",
    221         f"parameter of {func.__qualname__} must be",
    222         str(e),
    223     )

File c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py:719, in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, params, pre_dispatch, error_score)
    716 # To ensure multimetric format is not supported
    717 scorer = check_scoring(estimator, scoring=scoring)
--> 719 cv_results = cross_validate(
    720     estimator=estimator,
    721     X=X,
    722     y=y,
    723     groups=groups,
    724     scoring={"score": scorer},
    725     cv=cv,
    726     n_jobs=n_jobs,
    727     verbose=verbose,
    728     fit_params=fit_params,
    729     params=params,
    730     pre_dispatch=pre_dispatch,
    731     error_score=error_score,
    732 )
    733 return cv_results["test_score"]

File c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\_param_validation.py:213, in validate_params.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
    207 try:
    208     with config_context(
    209         skip_parameter_validation=(
    210             prefer_skip_nested_validation or global_skip_validation
    211         )
    212     ):
--> 213         return func(*args, **kwargs)
    214 except InvalidParameterError as e:
    215     # When the function is just a wrapper around an estimator, we allow
    216     # the function to delegate validation to the estimator, but we replace
    217     # the name of the estimator by the name of the function in the error
    218     # message to avoid confusion.
    219     msg = re.sub(
    220         r"parameter of \w+ must be",
    221         f"parameter of {func.__qualname__} must be",
    222         str(e),
    223     )

File c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py:450, in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, params, pre_dispatch, return_train_score, return_estimator, return_indices, error_score)
    429 parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)
    430 results = parallel(
    431     delayed(_fit_and_score)(
    432         clone(estimator),
   (...)
    447     for train, test in indices
    448 )
--> 450 _warn_or_raise_about_fit_failures(results, error_score)
    452 # For callable scoring, the return type is only know after calling. If the
    453 # return type is a dictionary, the error scores can now be inserted with
    454 # the correct key.
    455 if callable(scoring):

File c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py:536, in _warn_or_raise_about_fit_failures(results, error_score)
    529 if num_failed_fits == num_fits:
    530     all_fits_failed_message = (
    531         f"\nAll the {num_fits} fits failed.\n"
    532         "It is very likely that your model is misconfigured.\n"
    533         "You can try to debug the error by setting error_score='raise'.\n\n"
    534         f"Below are more details about the failures:\n{fit_errors_summary}"
    535     )
--> 536     raise ValueError(all_fits_failed_message)
    538 else:
    539     some_fits_failed_message = (
    540         f"\n{num_failed_fits} fits failed out of a total of {num_fits}.\n"
    541         "The score on these train-test partitions for these parameters"
   (...)
    545         f"Below are more details about the failures:\n{fit_errors_summary}"
    546     )

ValueError: All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:

1 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 730, in inner_f
    return func(**kwargs)
  File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\sklearn.py", line 1055, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
  File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\sklearn.py", line 521, in _wrap_evaluation_matrices
    train_dmatrix = create_dmatrix(
  File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\sklearn.py", line 958, in _create_dmatrix
    return QuantileDMatrix(
  File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 730, in inner_f
    return func(**kwargs)
  File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 1529, in __init__
    self._init(
  File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 1588, in _init
    it.reraise()
  File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 576, in reraise
    raise exc  # pylint: disable=raising-bad-type
  File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 557, in _handle_exception
    return fn()
  File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 641, in
    return self._handle_exception(lambda: self.next(input_data), 0)
  File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\data.py", line 1280, in next
    input_data(**self.kwargs)
  File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 730, in inner_f
    return func(**kwargs)
  File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 624, in input_data
    new, cat_codes, feature_names, feature_types = _proxy_transform(
  File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\data.py", line 1305, in _proxy_transform
    data, _ = _ensure_np_dtype(data, data.dtype)
  File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\data.py", line 176, in _ensure_np_dtype
    data = data.astype(dtype, copy=False)
ValueError: could not convert string to float: 'piglet'

4 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 730, in inner_f
    return func(**kwargs)
  File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\sklearn.py", line 1055, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
  File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\sklearn.py", line 521, in _wrap_evaluation_matrices
    train_dmatrix = create_dmatrix(
  File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\sklearn.py", line 958, in _create_dmatrix
    return QuantileDMatrix(
  File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 730, in inner_f
    return func(**kwargs)
  File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 1529, in __init__
    self._init(
  File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 1588, in _init
    it.reraise()
  File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 576, in reraise
    raise exc  # pylint: disable=raising-bad-type
  File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 557, in _handle_exception
    return fn()
  File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 641, in
    return self._handle_exception(lambda: self.next(input_data), 0)
  File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\data.py", line 1280, in next
    input_data(**self.kwargs)
  File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 730, in inner_f
    return func(**kwargs)
  File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 624, in input_data
    new, cat_codes, feature_names, feature_types = _proxy_transform(
  File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\data.py", line 1305, in _proxy_transform
    data, _ = _ensure_np_dtype(data, data.dtype)
  File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\data.py", line 176, in _ensure_np_dtype
    data = data.astype(dtype, copy=False)
ValueError: could not convert string to float: 'Christopher'
```
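My reading of the frames above (an assumption on my part, not a confirmed diagnosis): `_calc_score` in mlxtend's `utilities.py` slices `X[:, IDX]`, i.e. a plain NumPy array, so the DataFrame and its `category` dtype are gone by the time XGBoost builds its DMatrix, and the raw category strings can no longer be converted. The snippet below is a minimal sketch that tries to reproduce the same failure outside of SFS, assuming an XGBoost version where `enable_categorical=True` works on pandas `category` columns directly (as the successful `clf.fit(X_train, y_train)` call above suggests):

```python
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.datasets import make_regression

# Rebuild a mixed numeric/categorical frame like the one in the report.
X_num, y = make_regression(n_samples=100, n_features=9, noise=0.1, random_state=0)
X_cat = np.random.choice(['pooh', 'rabbit', 'piglet', 'Christopher'], size=(100, 2))
df = pd.DataFrame(np.hstack((X_num, X_cat)), columns=[f'F{i}' for i in range(11)])
num_cols = [f'F{i}' for i in range(9)]
df[num_cols] = df[num_cols].astype(float)
df[['F9', 'F10']] = df[['F9', 'F10']].astype('category')

clf = xgb.XGBRegressor(enable_categorical=True)

# Fitting on the DataFrame works, because XGBoost can see the category dtype.
clf.fit(df, y)

# Fitting on the NumPy conversion of the same frame (which is what SFS appears
# to hand to the estimator via X[:, IDX]) should fail the same way as the
# traceback above: "ValueError: could not convert string to float".
clf.fit(df.to_numpy(), y)
```

If that reading is right, the problem is not that mlxtend rejects categorical data as such, but that the pandas dtype information is lost in the internal DataFrame-to-NumPy conversion before the estimator is fitted.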

Why does this package not work with categorical features?
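One workaround that seems to sidestep the issue (my own suggestion, not something the mlxtend docs describe): ordinal-encode the two categorical columns before running SFS, so every column is already numeric when mlxtend converts the frame to a NumPy array. Note that the categories are then treated as plain ordered numbers by XGBoost rather than as true categorical splits, so this is only an approximation. Reusing `df` and `y` from the code above:

```python
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
import xgboost as xgb

# Encode the categorical columns as integers up front so nothing non-numeric
# survives the DataFrame -> NumPy conversion inside SFS.
df_encoded = df.copy()
codes = OrdinalEncoder().fit_transform(df[['F9', 'F10']])
df_encoded['F9'] = codes[:, 0]
df_encoded['F10'] = codes[:, 1]
df_encoded = df_encoded.astype(float)

X_train, X_test, y_train, y_test = train_test_split(
    df_encoded, y, test_size=0.25, random_state=0)

sfs = SFS(xgb.XGBRegressor(),
          k_features="best",
          forward=False,
          scoring="neg_mean_absolute_error",
          cv=5)
sfs = sfs.fit(X_train, y_train)
print(sfs.k_feature_names_)
```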