dmlc / xgboost

Scalable, Portable and Distributed Gradient Boosting (GBDT, GBRT or GBM) Library, for Python, R, Java, Scala, C++ and more. Runs on single machine, Hadoop, Spark, Dask, Flink and DataFlow
https://xgboost.readthedocs.io/en/stable/
Apache License 2.0

XGBoost Fails with Polars DataFrames Containing Categorical Columns #10554

Open mattharrison opened 3 months ago

mattharrison commented 3 months ago

XGBoost raises a ValueError when training a model on a Polars DataFrame that contains a categorical column.

import polars as pl
import xgboost as xgb

X = pl.DataFrame({
    'num1': [1, 2, 3, 4],
    'cat1': ['A', 'B', 'A', 'B']
})

y = pl.Series('age', [10, 9, 8, 11])
X = (X
     .with_columns(cat1=pl.col('cat1').cast(pl.Categorical))
     )

clf = xgb.XGBRegressor(enable_categorical=True)
clf.fit(X, y)

Error:

{
    "name": "ValueError",
    "message": "could not convert string to float: 'A'",
    "stack": "---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[7], line 17
     15 # Attempt to train an XGBoost model
     16 clf = xgb.XGBRegressor(enable_categorical=True)
---> 17 clf.fit(X, y)

File ~/.envs/menv/lib/python3.10/site-packages/xgboost/core.py:726, in require_keyword_args.<locals>.throw_if.<locals>.inner_f(*args, **kwargs)
    724 for k, arg in zip(sig.parameters, args):
    725     kwargs[k] = arg
--> 726 return func(**kwargs)

File ~/.envs/menv/lib/python3.10/site-packages/xgboost/sklearn.py:1081, in XGBModel.fit(self, X, y, sample_weight, base_margin, eval_set, verbose, xgb_model, sample_weight_eval_set, base_margin_eval_set, feature_weights)
   1079 with config_context(verbosity=self.verbosity):
   1080     evals_result: TrainingCallback.EvalsLog = {}
-> 1081     train_dmatrix, evals = _wrap_evaluation_matrices(
   1082         missing=self.missing,
   1083         X=X,
   1084         y=y,
   1085         group=None,
   1086         qid=None,
   1087         sample_weight=sample_weight,
   1088         base_margin=base_margin,
   1089         feature_weights=feature_weights,
   1090         eval_set=eval_set,
   1091         sample_weight_eval_set=sample_weight_eval_set,
   1092         base_margin_eval_set=base_margin_eval_set,
   1093         eval_group=None,
   1094         eval_qid=None,
   1095         create_dmatrix=self._create_dmatrix,
   1096         enable_categorical=self.enable_categorical,
   1097         feature_types=self.feature_types,
   1098     )
   1099     params = self.get_xgb_params()
   1101     if callable(self.objective):

File ~/.envs/menv/lib/python3.10/site-packages/xgboost/sklearn.py:596, in _wrap_evaluation_matrices(missing, X, y, group, qid, sample_weight, base_margin, feature_weights, eval_set, sample_weight_eval_set, base_margin_eval_set, eval_group, eval_qid, create_dmatrix, enable_categorical, feature_types)
    576 def _wrap_evaluation_matrices(
    577     missing: float,
    578     X: Any,
   (...)
    592     feature_types: Optional[FeatureTypes],
    593 ) -> Tuple[Any, List[Tuple[Any, str]]]:
    594     \"\"\"Convert array_like evaluation matrices into DMatrix.  Perform validation on the
    595     way.\"\"\"
--> 596     train_dmatrix = create_dmatrix(
    597         data=X,
    598         label=y,
    599         group=group,
    600         qid=qid,
    601         weight=sample_weight,
    602         base_margin=base_margin,
    603         feature_weights=feature_weights,
    604         missing=missing,
    605         enable_categorical=enable_categorical,
    606         feature_types=feature_types,
    607         ref=None,
    608     )
    610     n_validation = 0 if eval_set is None else len(eval_set)
    612     def validate_or_none(meta: Optional[Sequence], name: str) -> Sequence:

File ~/.envs/menv/lib/python3.10/site-packages/xgboost/sklearn.py:1008, in XGBModel._create_dmatrix(self, ref, **kwargs)
   1006     except TypeError:  # `QuantileDMatrix` supports lesser types than DMatrix
   1007         pass
-> 1008 return DMatrix(**kwargs, nthread=self.n_jobs)

File ~/.envs/menv/lib/python3.10/site-packages/xgboost/core.py:726, in require_keyword_args.<locals>.throw_if.<locals>.inner_f(*args, **kwargs)
    724 for k, arg in zip(sig.parameters, args):
    725     kwargs[k] = arg
--> 726 return func(**kwargs)

File ~/.envs/menv/lib/python3.10/site-packages/xgboost/core.py:878, in DMatrix.__init__(self, data, label, weight, base_margin, missing, silent, feature_names, feature_types, nthread, group, qid, label_lower_bound, label_upper_bound, feature_weights, enable_categorical, data_split_mode)
    875     assert self.handle is not None
    876     return
--> 878 handle, feature_names, feature_types = dispatch_data_backend(
    879     data,
    880     missing=self.missing,
    881     threads=self.nthread,
    882     feature_names=feature_names,
    883     feature_types=feature_types,
    884     enable_categorical=enable_categorical,
    885     data_split_mode=data_split_mode,
    886 )
    887 assert handle is not None
    888 self.handle = handle

File ~/.envs/menv/lib/python3.10/site-packages/xgboost/data.py:1259, in dispatch_data_backend(data, missing, threads, feature_names, feature_types, enable_categorical, data_split_mode)
   1257 if _has_array_protocol(data):
   1258     array = np.asarray(data)
-> 1259     return _from_numpy_array(array, missing, threads, feature_names, feature_types)
   1261 converted = _convert_unknown_data(data)
   1262 if converted is not None:

File ~/.envs/menv/lib/python3.10/site-packages/xgboost/data.py:253, in _from_numpy_array(data, missing, nthread, feature_names, feature_types, data_split_mode)
    251 \"\"\"Initialize data from a 2-D numpy matrix.\"\"\"
    252 _check_data_shape(data)
--> 253 data, _ = _ensure_np_dtype(data, data.dtype)
    254 handle = ctypes.c_void_p()
    255 _check_call(
    256     _LIB.XGDMatrixCreateFromDense(
    257         _array_interface(data),
   (...)
    264     )
    265 )

File ~/.envs/menv/lib/python3.10/site-packages/xgboost/data.py:224, in _ensure_np_dtype(data, dtype)
    222 if _array_hasobject(data) or data.dtype in [np.float16, np.bool_]:
    223     dtype = np.float32
--> 224     data = data.astype(dtype, copy=False)
    225 if not data.flags.aligned:
    226     data = np.require(data, requirements=\"A\")

ValueError: could not convert string to float: 'A'"
}

Expected result: training should succeed with the Polars DataFrame directly, just as it does after converting the same frame to pandas:

import polars as pl
import xgboost as xgb

X = pl.DataFrame({
    'num1': [1, 2, 3, 4],
    'cat1': ['A', 'B', 'A', 'B']
})

y = pl.Series('age', [10, 9, 8, 11])
X = (X
     .with_columns(cat1=pl.col('cat1').cast(pl.Categorical))
     )

clf = xgb.XGBRegressor(enable_categorical=True)
clf.fit(X.to_pandas(), y)

trivialfis commented 3 months ago

We don't support Polars yet. I'm hoping https://github.com/dmlc/xgboost/issues/10452 will help XGBoost manage the variety of dataframe inputs. We currently support cuDF, pandas, modin, pyarrow (no categorical support yet, due to a missing feature in pyarrow the last time we checked), and datatable. The dispatching code, along with the CI, is getting out of hand, especially with the many extensions that pandas has.
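
Until native Polars support lands, one workaround besides to_pandas() is to hand XGBoost a plain NumPy array together with an explicit feature-type list. The following is only a sketch, not an official Polars integration: using to_physical() to extract the category codes and the manual feature_types=['q', 'c'] mapping are assumptions about how to encode this particular frame, not library-provided behavior.

import numpy as np
import polars as pl
import xgboost as xgb

X = pl.DataFrame({
    'num1': [1, 2, 3, 4],
    'cat1': ['A', 'B', 'A', 'B'],
}).with_columns(cat1=pl.col('cat1').cast(pl.Categorical))
y = pl.Series('age', [10, 9, 8, 11])

# Replace the categorical column with its physical integer codes so the
# whole frame becomes numeric and can be converted to a NumPy array.
X_codes = X.with_columns(cat1=pl.col('cat1').to_physical())

# 'q' marks a numeric column, 'c' marks a categorical one.
dtrain = xgb.DMatrix(
    X_codes.to_numpy().astype(np.float32),
    label=y.to_numpy(),
    feature_names=X.columns,
    feature_types=['q', 'c'],
    enable_categorical=True,
)
booster = xgb.train({'objective': 'reg:squarederror'}, dtrain)

Converting through pandas, as in the snippet above, works for the same reason: the Polars Categorical column round-trips to a pandas category dtype, which the existing pandas dispatch path already understands.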