scikit-learn-contrib / MAPIE

A scikit-learn-compatible module to estimate prediction intervals and control risks based on conformal predictions.
https://mapie.readthedocs.io/en/latest/
BSD 3-Clause "New" or "Revised" License
1.2k stars 99 forks source link

Does MAPIE Regressor support categorical variables? #406

Open valeman opened 5 months ago

valeman commented 5 months ago

MAPIE Regressor with CatBoost and categorical variables works fine; however, when using LightGBM it raises the error `ValueError: could not convert string to float: 'class 1'`.

vincentblot28 commented 5 months ago

Hi @valeman, could you provide some code so that we can see the error ?

valeman commented 5 months ago

Here you go, @vincentblot28: the code runs fine when the underlying regressor is CatBoost, but raises an error with LightGBM.

# Reproduction script: both models are passed as already-fitted estimators
# (prefit=True for the residual model, cv='prefit' for the mean model).
conformity_score = ResidualNormalisedScore(
    residual_estimator=sd_predictor, prefit=True
)
mapie_regressor = MapieRegressor(
    mean_predictor,
    conformity_score=conformity_score,
    cv='prefit',
    method='base',
)
# The ValueError reported in this issue is raised by this fit() call.
mapie_regressor.fit(X_calib, y_calib)
y_pred, y_pis = mapie_regressor.predict(X_test, alpha=0.1)

'--------------------------------------------------------------------------- ValueError Traceback (most recent call last) Cell In[135], line 3 1 conformity_score = ResidualNormalisedScore(residual_estimator=sd_predictor, prefit=True) 2 mapie_regressor = MapieRegressor(mean_predictor, conformity_score=conformity_score, cv='prefit', method='base') ----> 3 mapie_regressor.fit(X_calib, y_calib) 4 y_pred, y_pis = mapie_regressor.predict(X_test, alpha=0.1)

File ~/miniconda3/envs/py39/lib/python3.9/site-packages/mapie/regression/regression.py:539, in MapieRegressor.fit(self, X, y, sample_weight) 536 else: 537 ypred = self.estimator.predict_calib(X) 538 self.conformityscores = \ --> 539 self.conformity_scorefunction.get_conformity_scores( 540 X, y, y_pred 541 ) 543 return self

File ~/miniconda3/envs/py39/lib/python3.9/site-packages/mapie/conformity_scores/conformity_scores.py:211, in ConformityScore.get_conformity_scores(self, X, y, y_pred) 186 def get_conformity_scores( 187 self, 188 X: ArrayLike, 189 y: ArrayLike, 190 y_pred: ArrayLike, 191 ) -> NDArray: 192 """ 193 Get the conformity score considering the symmetrical property if so. 194 (...) 209 Conformity scores. 210 """ --> 211 conformity_scores = self.get_signed_conformity_scores(X, y, y_pred) 212 if self.consistency_check: 213 self.check_consistency(X, y, y_pred, conformity_scores)

File ~/miniconda3/envs/py39/lib/python3.9/site-packages/mapie/conformity_scores/residual_conformity_scores.py:403, in ResidualNormalisedScore.get_signed_conformity_scores(self, X, y, y_pred) 400 else: 401 cal_indexes = full_indexes 402 residuals_pred = np.maximum( --> 403 self._predict_residual_estimator(X[cal_indexes]), 404 self.eps 405 ) 407 signed_conformity_scores = np.divide( 408 np.subtract(y[cal_indexes], y_pred[cal_indexes]), 409 residuals_pred 410 ) 412 # reconstruct array with nan and conformity scores

File ~/miniconda3/envs/py39/lib/python3.9/site-packages/mapie/conformity_scores/residual_conformity_scores.py:352, in ResidualNormalisedScore._predict_residual_estimator(self, X) 327 def _predict_residual_estimator( 328 self, 329 X: ArrayLike 330 ) -> NDArray: 331 """ 332 Returns the predictions of the residual estimator. Raises a warning if 333 the model predicts neagtive values. (...) 350 the residuals and predict the exponential of the predictions. 351 """ --> 352 pred = self.residualestimator.predict(X) 353 if self.prefit and np.any(pred < 0): 354 warnings.warn( 355 "WARNING: The residual model predicts negative values, " 356 + "they are later thresholded at self.eps." (...) 359 + "the exponential of the predictions." 360 )

File ~/miniconda3/envs/py39/lib/python3.9/site-packages/lightgbm/sklearn.py:934, in LGBMModel.predict(self, X, raw_score, start_iteration, num_iteration, pred_leaf, pred_contrib, validate_features, **kwargs) 932 raise LGBMNotFittedError("Estimator not fitted, call fit before exploiting the model.") 933 if not isinstance(X, (pd_DataFrame, dt_DataTable)): --> 934 X = _LGBMCheckArray(X, accept_sparse=True, force_all_finite=False) 935 n_features = X.shape[1] 936 if self._n_features != n_features:

File ~/miniconda3/envs/py39/lib/python3.9/site-packages/sklearn/utils/validation.py:915, in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name) 913 array = xp.astype(array, dtype, copy=False) 914 else: --> 915 array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp) 916 except ComplexWarning as complex_warning: 917 raise ValueError( 918 "Complex data not supported\n{}\n".format(array) 919 ) from complex_warning

File ~/miniconda3/envs/py39/lib/python3.9/site-packages/sklearn/utils/_array_api.py:380, in _asarray_with_order(array, dtype, order, copy, xp) 378 array = numpy.array(array, order=order, dtype=dtype) 379 else: --> 380 array = numpy.asarray(array, order=order, dtype=dtype) 382 # At this point array is a NumPy ndarray. We convert it to an array 383 # container that is consistent with the input's namespace. 384 return xp.asarray(array)

ValueError: could not convert string to float: 'class 1''

vincentblot28 commented 5 months ago

Thanks. I think I need more details (for instance, what is your `sd_predictor`?). Could you give a reproducible example so that I can run it?

salmuz commented 4 months ago

@vincentblot28 I think this problem comes from the input format: LightGBM can predict from a pandas DataFrame that has categorical columns, but if the same data is passed as a NumPy array, the category labels are plain strings and LightGBM throws an exception.

@valeman Below is a simple fix that works with LightGBM.

def get_signed_conformity_scores(
    self,
    X: ArrayLike,
    y: ArrayLike,
    y_pred: ArrayLike
) -> NDArray:
    """
    Compute the predicted residuals used to normalise conformity scores.

    This is an excerpt of the method: code elided by the author is marked
    with ``# .....``; the computation of the scores themselves and the
    ``return`` statement are in the elided tail.

    Workaround for estimators such as LightGBM that accept categorical
    columns only through a DataFrame: when ``X`` is a pandas DataFrame,
    the checked array is wrapped back into a DataFrame carrying the
    original column names and dtypes before it is handed to the residual
    estimator. Any other input type is forwarded unchanged.

    Parameters
    ----------
    X: ArrayLike
        Observed features. May be a pandas DataFrame (column names and
        dtypes are then preserved) or any array-like.
    y: ArrayLike
        Observed targets.
    y_pred: ArrayLike
        Predicted targets. Rows where ``y_pred`` is NaN are excluded.

    Returns
    -------
    NDArray
        Signed conformity scores (produced by the elided tail).
    """
    # .....
    (X_array, y_array, y_pred,
     self.residual_estimator_,
     random_state) = self._check_parameters(X, y, y_pred)

    # Restore the DataFrame view once, for both branches below.
    # NOTE(review): X_array is presumably a plain NumPy array returned by
    # _check_parameters, which is what drops the categorical dtypes.
    # ToDo: Check how workaround that
    if isinstance(X, pd.DataFrame):
        X_features = pd.DataFrame(X_array, columns=X.columns)
        X_features = X_features.astype(X.dtypes.to_dict())
    else:
        # No column names/dtypes to restore (e.g. ndarray input).
        X_features = X_array

    def _select_rows(indexes):
        """Return the rows of ``X_features`` at the given positions."""
        if isinstance(X_features, pd.DataFrame):
            return X_features.iloc[indexes]
        return X_features[indexes]

    # Positions of the samples whose prediction is not NaN.
    full_indexes = np.argwhere(
        np.logical_not(np.isnan(y_pred))
    ).reshape((-1,))

    if not self.prefit:
        # Split the valid samples: one part fits the residual estimator,
        # the other is used to compute the conformity scores.
        cal_indexes, res_indexes = train_test_split(
            full_indexes,
            test_size=self.split_size,
            random_state=random_state,
        )

        self.residual_estimator_ = self._fit_residual_estimator(
            clone(self.residual_estimator_),
            _select_rows(res_indexes),
            y_array[res_indexes],
            y_pred[res_indexes]
        )
        # The exponential is applied to the predictions in this branch
        # only; results are floored at self.eps.
        residuals_pred = np.maximum(
            np.exp(self._predict_residual_estimator(
                _select_rows(cal_indexes)
            )),
            self.eps
        )
    else:
        # Pre-fitted residual estimator: every valid sample is used.
        cal_indexes = full_indexes
        residuals_pred = np.maximum(
            self._predict_residual_estimator(_select_rows(cal_indexes)),
            self.eps
        )
    #.....