loft-br / xgboost-survival-embeddings

Improving XGBoost survival analysis with embeddings and debiased estimators
https://loft-br.github.io/xgboost-survival-embeddings/
Apache License 2.0

XGBSE error with sklearn pipeline and GridSearchCV #61

Open jules-germany opened 1 year ago

jules-germany commented 1 year ago

Code sample


import time

import numpy as np
import pandas as pd
import sklearn.model_selection
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sksurv.metrics import concordance_index_censored  # assumed from scikit-survival; used by the custom scorer below
from xgbse import XGBSEKaplanNeighbors

[Radiomics_Clinical_GMMC_2022_09_22-CT_der_PET-mix.csv](https://github.com/loft-br/xgboost-survival-embeddings/files/9636594/Radiomics_Clinical_GMMC_2022_09_22-CT_der_PET-mix.csv)

csv_path = "C:/Radiomics_Clinical_GMMC_2022_09_22-CT_der_PET-mix.csv"

df = pd.read_csv(csv_path)
df = df.set_index('cohort', drop=True)
df.index.rename('index', inplace=True)
df.index = df.index.astype(int)

X = df.drop(columns=['PFS_binaer_Progress', 'Ereignis_korrigiert_Update_2021_03', 'DFS_M_ED_Update_2021_03',
                     'Pseudonym', 'train_test_mix', 'SUVmax', 'SEX_SPSS', 
                     'DIAGECOG_komplett_ueber_1', 'DIAALTER', 'PTNM_T_SPSS_korr_grob_7th',
                     'PTNM_N_SPSS_korr', 'STADIUM_GROB_SPSS_7thEdition',
                     'R_Status', 'PTNM_T_SPSS_korr_7th', 'STADIUM_SPSS_7thEdition',
                     'Histo_Subtyp', 'NEOADJ_CHEMO', 'NEOADJ_BESTR', 'ADJ_CHEMO', 'ADJ_BESTR',
                     'ANY_CHEMO', 'ANY_BESTR', 'ASP_high_19_5', 'ASP', 'ASP_high_33_3'])

y_df = pd.DataFrame()
y_df['event'] = df['Ereignis_korrigiert_Update_2021_03'].astype(bool)
y_df['time'] = df['DFS_M_ED_Update_2021_03']

# split dataframe into training+test and validation cohort

X_train_test = X.iloc[X.index.isin([1])]
X_valid = X.iloc[X.index.isin([2])]

y_train_test_df = y_df.iloc[y_df.index.isin([1])]
y_valid_df = y_df.iloc[y_df.index.isin([2])]

# convert the labels into structured arrays of (event, time) records
s = y_df.dtypes
y_train_test = np.array([tuple(x) for x in y_train_test_df.values], dtype=list(zip(s.index, s)))
y_valid = np.array([tuple(x) for x in y_valid_df.values], dtype=list(zip(s.index, s)))

def score_survival_model(model, X, y):
    # custom scorer with the (estimator, X, y) signature that sklearn expects
    prediction = model.predict(X)
    result = concordance_index_censored(y['event'], y['time'], prediction)
    return result[0]

feature_select_dict = {
                        "MIC" : SelectKBest(mutual_info_classif, k=30),                        
                        }

p_grid_dict = {"xgbse" : {"estimator__objective": ["survival:aft"],
                             "estimator__eval_metric": ["aft-nloglik"],
                             "estimator__aft_loss_distribution": ["normal", "logistic"],
                             "estimator__aft_loss_distribution_scale": [0.5, 1.0, 1.5],
                             "estimator__tree_method": ["hist"],
                             "estimator__learning_rate": np.logspace(-2, 0, num=6),
                             "estimator__max_depth": [0, 1, 5, 10, 15, 25, 50],
                             "estimator__booster": ["dart"],
                             "estimator__subsample": [0.1, 0.2, 0.4, 0.6, 0.8, 1.0],
                             "estimator__min_child_weight": [1, 2, 5, 10],
                             "estimator__colsample_bynode": [0.5, 1.0, 2.0]}
                        }

models_dict = {
                "xgbse" : XGBSEKaplanNeighbors(xgb_params=p_grid_dict['xgbse'])
                }

inner_cv = sklearn.model_selection.KFold(n_splits=10, shuffle=True, random_state=1)
outer_cv = sklearn.model_selection.KFold(n_splits=10, shuffle=True, random_state=1)

def model_scores(feature_select_dict, models_dict, p_grid_dict, X_train, y_train, X_valid, y_valid):

    # define the scaler and prepare empty dict or dataframe to assemble results and best parameters

    scaler = RobustScaler(with_centering = False)
    models_df_dict = dict()
    params_df = pd.DataFrame()    

    for outerKey in feature_select_dict:

        models = pd.DataFrame()  
        feature_select = feature_select_dict[outerKey]

        for innerKey in models_dict:

            # instantiate model

            model = models_dict[innerKey]
            p_grid = p_grid_dict[innerKey]

            # inner loop of nested cross-validation: perform hyperparameter tuning in the training set of the outer loop

            t1 = time.time()

            pipeline = Pipeline([('scaling', scaler), ('feature_selection', feature_select), ('estimator', model)])

            clf_model = sklearn.model_selection.GridSearchCV(estimator=pipeline, 
                                                             scoring=score_survival_model,
                                                             param_grid=p_grid, 
                                                             cv=inner_cv, refit=True) 

            # outer loop: train the model with training data and score the performance on the test folds
            nested_test_score = sklearn.model_selection.cross_val_score(clf_model, scoring=score_survival_model,
                                                                        X=X_train, y=y_train, cv=outer_cv)

            # aggregate the concordance scores from the outer test folds and score the validation set

            test_mean = nested_test_score.mean()
            test_std = nested_test_score.std()

            clf_model_fit = clf_model.fit(X_train, y_train)

            clf_model_best_parameters = str(clf_model.best_params_)

            valid_score = clf_model.score(X_valid, y_valid)

            test_plus_valid = test_mean + valid_score

            model_time = (time.time()-t1)

            # at the end of this nested CV: add results for this model to the models dataframe 

            models[innerKey] = [test_mean, test_std, model_time, valid_score, test_plus_valid]

            df_list = [outerKey, innerKey, clf_model_best_parameters]
            params = pd.DataFrame(df_list)
            params_df = pd.concat([params_df, params], axis = 1)

        # add model results for this feature_select_dict key to the dict of model results
        # and append the best parameters to the dataframe

        models_df_dict[outerKey] = models 

    # after all model calculations: add a multiindex, transpose the dataframe and sort by highest nested test mean
    # finalize the "best parameters" dataframe

    multiindex = {(outerKey, innerKey): values for outerKey, innerDict in models_df_dict.items() for innerKey, values in innerDict.items()}

    models_df_dict_multiindex = pd.DataFrame(multiindex)
    models_df_dict_multiindex.index = ['nested_test_mean', 'nested_test_SD', 'time', 'valid', 'test_plus_valid']

    models_transpose = models_df_dict_multiindex.transpose()
    models_transpose.index.set_names(['pre', 'model'], inplace=True)
    models_transpose = models_transpose.sort_values(by = ['nested_test_mean'], ascending = False)

    params_df = params_df.T
    params_df.columns = ['feature_select', 'model', 'parameters']
    params_df = params_df.sort_values(by = ['model', 'feature_select'], ascending = [True, True])

    return models_transpose, params_df

results, params = model_scores(feature_select_dict, models_dict, p_grid_dict, X_train_test, y_train_test, X_valid, y_valid)
results_ready = results.reset_index(level=['pre', 'model'])
print(results_ready)

Problem description

Using GridSearchCV is not possible because XGBSE expects all hyperparameters to be passed as a single xgb_params dict at model initialization, where each value must be a plain scalar, whereas GridSearchCV expects each candidate value wrapped in a list and sets parameters individually via set_params. XGBSE therefore seems to be incompatible with GridSearchCV. XGBSE also seems to be incompatible with sklearn's Pipeline: when the pipeline is fitted, the final XGBSE step receives X as a np.array, which has no index. This raises an error because XGBSE's fit seems to require X.index.
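For illustration, a minimal sketch of the set_params mismatch (parameter values are arbitrary):

from xgbse import XGBSEKaplanNeighbors

model = XGBSEKaplanNeighbors(xgb_params={"learning_rate": 0.1})

# GridSearchCV effectively does this for a grid entry {"learning_rate": 0.05};
# it raises ValueError because learning_rate is not a constructor argument
# of XGBSEKaplanNeighbors:
model.set_params(learning_rate=0.05)

# the only individually settable parameter is the whole dict, which would
# force the grid to enumerate complete dicts instead of single values:
model.set_params(xgb_params={"learning_rate": 0.05})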

Expected behavior

It would be expected that XGBSE can be used with GridSearchCV and with sklearn pipelines.

Possible solutions

XGBSE would need to allow its hyperparameters to be set individually, and fitting would need to accept X without an index (a plain np.array).
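For the missing-index half, one possible workaround (a sketch, not an official xgbse fix; the DataFrameRestorer name is made up here) is a pipeline step that re-wraps the array as a DataFrame just before the estimator. On scikit-learn >= 1.2, pipeline.set_output(transform="pandas") may achieve the same without a custom class.

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameRestorer(BaseEstimator, TransformerMixin):
    # re-wrap the ndarray output of upstream steps so the final
    # estimator receives an object that has an index again

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if isinstance(X, pd.DataFrame):
            return X
        return pd.DataFrame(np.asarray(X))

# placed between feature selection and the XGBSE estimator:
# Pipeline([('scaling', scaler),
#           ('feature_selection', feature_select),
#           ('restore_df', DataFrameRestorer()),
#           ('estimator', model)])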

thecml commented 1 year ago

I stumbled into the same issue. Is there any workaround for this, other than tuning hyperparameters with a vanilla XGBoost model and simply transferring them to the XGBSE one?
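For reference, the transfer itself is straightforward, since XGBSE accepts a plain parameter dict at construction. A minimal sketch, where best_params is assumed to come from a separate vanilla-XGBoost tuning run (not shown):

from xgbse import XGBSEKaplanNeighbors

# hypothetical output of a vanilla-XGBoost tuning run
best_params = {
    "objective": "survival:aft",
    "eval_metric": "aft-nloglik",
    "aft_loss_distribution": "normal",
    "aft_loss_distribution_scale": 1.0,
    "learning_rate": 0.05,
    "max_depth": 5,
}

model = XGBSEKaplanNeighbors(xgb_params=best_params)
model.fit(X_train_test, y_train_test)  # structured y, as in the code sample above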

brunocarlin commented 1 year ago

We have developed a workaround:

from xgbse import XGBSEStackedWeibull

class sklearn_wei(XGBSEStackedWeibull):

    def get_params2(self):
        # convenience accessor for the inner xgb_params dict
        return self.get_params()['xgb_params']

    def set_params(self, **params):
        # route parameters coming from sklearn search objects into xgb_params,
        # so individual XGBoost hyperparameters become tunable
        old_params = self.get_params2()
        old_params.update(params)
        self.xgb_params = old_params
        return self

ok = sklearn_wei()

After you define the sklearn-compatible model, things work as normal:

from skopt import BayesSearchCV
from skopt import space

from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import StratifiedKFold

from xgbse.metrics import (
    concordance_index,
    approx_brier_score,
    dist_calibration_score
)
opt = BayesSearchCV(
    estimator=ok,
    scoring=make_scorer(concordance_index),
    n_iter=2,
    random_state=42,
    cv=StratifiedKFold(n_splits=4, shuffle=True),
    n_jobs=-1,
    n_points=1,
    verbose=1,
    search_spaces={
        'max_depth': space.Integer(low=1, high=32, prior='uniform'),  # I don't know why, but more than 32 breaks
        'learning_rate': space.Real(10**-3, 10**-1, "log-uniform"),
        # 'tree_method': space.Categorical(categories=['hist', 'exact']),
        'aft_loss_distribution': space.Categorical(categories=['normal', 'extreme', 'logistic']),
        # 'aft_loss_distribution_scale': space.Real(low=10**-2, high=3*10**1, prior="log-uniform"),
        # 'n_estimators': space.Real(low=10**1, high=10**3, prior="log-uniform"),
        'subsample': space.Real(low=0.8, high=1, prior="uniform"),
        'colsample_bytree': space.Real(low=0.8, high=1, prior="uniform"),
        'reg_alpha': space.Real(low=0, high=1, prior="uniform"),
        'reg_lambda': space.Real(low=0, high=1, prior="uniform"),
        'max_bin': space.Integer(low=200, high=1000, prior='log-uniform'),
        'min_split_loss': space.Real(low=0, high=10, prior='uniform')
    },
    fit_params={}
)
opt.fit(X=X_train,
        y=y_train,
        time_bins=range(1, 59),
        validation_data=(X_valid, y_valid),
        early_stopping_rounds=5)

df_results = (
    pd.DataFrame(opt.cv_results_)
    .sort_values(by='rank_test_score', ascending=True)
)
df_results['params'] = df_results['params'].astype('string')
df_results.reset_index(drop=True, inplace=True)
df_results
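The same wrapper should also make plain GridSearchCV from the original report usable. A minimal sketch, assuming the sklearn_wei class and the training data defined above:

from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import make_scorer
from xgbse.metrics import concordance_index

# illustrative grid; thanks to the set_params override, each key maps
# directly onto an entry of the inner xgb_params dict
param_grid = {
    'aft_loss_distribution': ['normal', 'logistic'],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [2, 4, 8],
}

grid = GridSearchCV(
    estimator=sklearn_wei(),
    param_grid=param_grid,
    scoring=make_scorer(concordance_index),
    cv=KFold(n_splits=4, shuffle=True, random_state=42),
    refit=True,
)
grid.fit(X_train, y_train)
print(grid.best_params_)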