fabsig / GPBoost

Combining tree-boosting with Gaussian process and mixed effects models

Predictions underfit #106

Closed VHolstein closed 1 year ago

VHolstein commented 1 year ago

I'm currently using your package inside another Python package (photonai) to build a model pipeline, so that a number of different models can be run through the same pipeline. I have integrated your GPBoostRegressor as shown below. For testing purposes, I'm training the model on a simple dataframe where X is a vector of numbers and y is the sum of those numbers. I expected a mean absolute error of around 0, but that is not the case. Any suggestions as to what I might be doing wrong when integrating your model like this?

from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import GroupKFold, KFold
from photonai.base import Hyperpipe, PipelineElement
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
import gpboost as gpb

class GPBoostDataWrapper(BaseEstimator, ClassifierMixin):

    def __init__(self):
        self.needs_covariates = True
        # TODO: Sklearn example: https://github.com/fabsig/GPBoost/blob/master/examples/python-guide/sklearn_example.py
        # TODO: python guide: https://github.com/fabsig/GPBoost/blob/master/examples/python-guide/generalized_linear_Gaussian_process_mixed_effects_models.py
        # self.gpmodel = gpb.GPModel(likelihood="gaussian")
        self.gpboost = gpb.GPBoostRegressor()

    def fit(self, X, y, **kwargs):
        if "cluster" and "Z" in kwargs:
            clst = pd.Series(kwargs["clusters"])
            gpmodel = gpb.GPModel(likelihood="gaussian", group_data=clst)
            self.gpboost.fit(X, y, gp_model=gpmodel)
        else:
            raise NotImplementedError("GPBoost needs clusters")
        return self

    def predict(self, X, **kwargs):
        clst = pd.Series(kwargs["clusters"])
        preds = self.gpboost.predict(X, group_data_pred=clst)
        preds = preds["response_mean"]
        return preds

    def save(self):
        return None

def get_gpboost_pipe(pipe_name, project_folder, split="group"):

    if split == "group":
        outercv = GroupKFold(n_splits=10)
    else:
        outercv = KFold(n_splits=10)

    my_pipe = Hyperpipe(pipe_name,
                        optimizer='grid_search',
                        metrics=['mean_absolute_error', 'mean_squared_error',
                                 'spearman_correlation', 'pearson_correlation'],
                        best_config_metric='mean_absolute_error',
                        outer_cv=outercv,
                        inner_cv=KFold(n_splits=10),
                        calculate_metrics_across_folds=True,
                        use_test_set=True,
                        verbosity=1,
                        project_folder=project_folder)

    # Add transformer elements
    my_pipe += PipelineElement("StandardScaler", hyperparameters={},
                               test_disabled=True, with_mean=True, with_std=True)

    my_pipe += PipelineElement.create("GPBoost", GPBoostDataWrapper(), hyperparameters={})

    return my_pipe
fabsig commented 1 year ago

Thanks for using GPBoost!

Can you please provide a reproducible example?

VHolstein commented 1 year ago

Yes, sure! Sorry for not providing this earlier. I made a mock example, very similar to the data I was using to debug my code. The model produces a mean absolute error quite similar to the Dummy Estimator (which just guesses the mean for every sample). In theory the model should be able to estimate the target accurately, as it is just the sum of all the values in a row. I'm sure I'm making an error in the way I use your code, I just have not been able to figure it out.

from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import GroupKFold, KFold
from photonai.base import Hyperpipe, PipelineElement
import numpy as np
import pandas as pd
import gpboost as gpb

class GPBoostDataWrapper(BaseEstimator, ClassifierMixin):

    def __init__(self):
        self.needs_covariates = True
        # self.gpmodel = gpb.GPModel(likelihood="gaussian")
        self.gpboost = gpb.GPBoostRegressor()

    def fit(self, X, y, **kwargs):
        if "clusters" in kwargs:
            clst = pd.Series(kwargs["clusters"])
            gpmodel = gpb.GPModel(likelihood="gaussian", group_data=clst)
            self.gpboost.fit(X, y, gp_model=gpmodel)
        else:
            raise NotImplementedError("GPBoost needs clusters")
        return self

    def predict(self, X, **kwargs):
        clst = pd.Series(kwargs["clusters"])
        preds = self.gpboost.predict(X, group_data_pred=clst)
        preds = preds["response_mean"]
        return preds

    def save(self):
        return None

def get_gpboost_pipe(pipe_name, project_folder, split="group"):

    if split == "group":
        outercv = GroupKFold(n_splits=10)
    else:
        outercv = KFold(n_splits=10)

    my_pipe = Hyperpipe(pipe_name,
                        optimizer='grid_search',
                        metrics=['mean_absolute_error', 'mean_squared_error',
                                 'spearman_correlation', 'pearson_correlation'],
                        best_config_metric='mean_absolute_error',
                        outer_cv=outercv,
                        inner_cv=KFold(n_splits=10),
                        calculate_metrics_across_folds=True,
                        use_test_set=True,
                        verbosity=1,
                        project_folder=project_folder)

    # Add transformer elements
    my_pipe += PipelineElement("StandardScaler", hyperparameters={},
                               test_disabled=True, with_mean=True, with_std=True)

    my_pipe += PipelineElement.create("GPBoost", GPBoostDataWrapper(), hyperparameters={})

    return my_pipe

def get_mock_data():

    X = np.random.randint(10, size=(200, 9))
    y = np.sum(X, axis=1)
    clst = np.random.randint(10, size=200)

    return X, y, clst

X, y, clst = get_mock_data()

# define project folder
project_folder = "/tmp/gpboost_debug"

my_pipe = get_gpboost_pipe("Test_gpboost", project_folder, split="random")
my_pipe.fit(X, y, clusters=clst)
fabsig commented 1 year ago

I get an error AttributeError: 'Hyperpipe' object has no attribute 'best_config_metric' when trying to run your code (photonai version 2.4.0).

In addition, can you please also provide the code that shows how you calculate your mean absolute error and how you compare it to a "dummy estimator"?

VHolstein commented 1 year ago

I just updated to photonai version 2.4.0 and the code above still runs for me. Previously I was running it with version 2.3.0. I also checked the source code for the Hyperpipe, and it does seem to have a best_config_metric attribute (https://github.com/wwu-mmll/photonai/blob/main/photonai/base/hyperpipe.py, line 282). Maybe testing it with version 2.3.0 will work better?

The photonai package calculates the mean absolute error and runs the dummy estimator automatically as part of the Hyperpipe. You select your metrics and best_config_metric: if these are regression metrics, the dummy estimator guesses the mean; if they are classification metrics, it guesses the most common value.
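For illustration, here is a minimal stand-alone sketch of that mean-guessing baseline using sklearn's DummyRegressor (this approximates what the Hyperpipe does internally; it is not photonai's actual code):

import numpy as np
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_absolute_error

# Mock data as in the example above: y is the sum of the columns of X
X = np.random.randint(10, size=(200, 9))
y = np.sum(X, axis=1)

dummy = DummyRegressor(strategy="mean")  # always predicts the training mean
dummy.fit(X, y)
print("dummy MAE:", mean_absolute_error(y, dummy.predict(X)))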

fabsig commented 1 year ago

I am still getting the same error with photonai version 2.3.0.

Traceback (most recent call last):

  File "C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\formatters.py", line 345, in __call__
    return method()

  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\base.py", line 625, in _repr_html_inner
    return estimator_html_repr(self)

  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\_estimator_html_repr.py", line 417, in estimator_html_repr
    _write_estimator_html(

  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\_estimator_html_repr.py", line 151, in _write_estimator_html
    est_block = _get_visual_block(estimator)

  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\_estimator_html_repr.py", line 127, in _get_visual_block
    for key, est in estimator.get_params(deep=False).items()

  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\base.py", line 211, in get_params
    value = getattr(self, key)

AttributeError: 'Hyperpipe' object has no attribute 'best_config_metric'

Out[1]: Hyperpipe(name='Test_gpboost')
RLeenings commented 1 year ago

Dear fabsig, the error you are experiencing occurs because you are apparently using a Jupyter Notebook: the fit method returns self, i.e. a Hyperpipe object, and the notebook then tries to render it. Please change the last line of code to: _ = my_pipe.fit(X, y, clusters=clst)
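With that change, the last two lines of the example read:

my_pipe = get_gpboost_pipe("Test_gpboost", project_folder, split="random")
_ = my_pipe.fit(X, y, clusters=clst)  # assigning to _ keeps Jupyter from rendering the returned Hyperpipe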

VHolstein commented 1 year ago

Dear fabsig, I have adapted the code so it can be tested outside of the photonai package, and it still shows the same problem. Maybe I'm using the GPBoost algorithm the wrong way?

from sklearn.base import BaseEstimator, ClassifierMixin
import numpy as np
import pandas as pd
import gpboost as gpb

class GPBoostDataWrapper(BaseEstimator, ClassifierMixin):

    def __init__(self):
        self.needs_covariates = True
        # self.gpmodel = gpb.GPModel(likelihood="gaussian")
        self.gpboost = gpb.GPBoostRegressor()

    def fit(self, X, y, **kwargs):
        if "clusters" in kwargs:
            clst = pd.Series(kwargs["clusters"])
            gpmodel = gpb.GPModel(likelihood="gaussian", group_data=clst)
            self.gpboost.fit(X, y, gp_model=gpmodel)
        else:
            raise NotImplementedError("GPBoost needs clusters")
        return self

    def predict(self, X, **kwargs):
        if "clusters" in kwargs:
            clst = pd.Series(kwargs["clusters"])
            preds = self.gpboost.predict(X, group_data_pred=clst)
            preds = preds["response_mean"]
            return preds
        else:
            raise NotImplementedError("GPBoost needs clusters")

    def save(self):
        return None

def get_mock_data():

    X = np.random.randint(10, size=(200, 9))
    y = np.sum(X, axis=1)
    clst = np.random.randint(10, size=200)

    return X, y, clst

X_train, y_train, clst_train = get_mock_data()
X_test, y_test, clst_test = get_mock_data()

model = GPBoostDataWrapper()

model.fit(X_train, y_train, clusters=clst_train)

y_pred = model.predict(X_test, clusters=clst_test)

print(np.mean(np.absolute(y_test - y_pred)))  # mean absolute error
VHolstein commented 1 year ago

I've also added an even simpler test case that illustrates the same problem.

import numpy as np
import pandas as pd
import gpboost as gpb

def get_mock_data():

    X = np.random.randint(10, size=(200, 9))
    y = np.sum(X, axis=1)
    clst = np.random.randint(10, size=200)

    return X, y, clst

# get train/test data
X_train, y_train, clst_train = get_mock_data()
X_test, y_test, clst_test = get_mock_data()

gp_model = gpb.GPModel(group_data=clst_train, likelihood="gaussian")
bst = gpb.GPBoostRegressor()
bst.fit(X_train, y_train, gp_model=gp_model)

preds = bst.predict(X_test, group_data_pred=clst_test)
print(np.sum(np.absolute(y_test - preds["response_mean"])) / len(y_test))  # mean absolute error
fabsig commented 1 year ago

Thanks for the example.

The reason is that you do not choose tuning parameters but use the default values. You must always choose tuning parameters appropriately for every data set when applying boosting; there are no universal "default values". The script below shows how this can be done in your example, and I obtain a much smaller MAE. Note that in this example you simulate from a linear model with a sample size of only n=200. It is thus expected that a linear model performs better than a tree-boosting model (the difference will become smaller for larger sample sizes when the true model is linear).

# Choose tuning parameters
param_grid = {'learning_rate': [1, 0.1, 0.01],
              'min_data_in_leaf': [10, 100, 1000],
              'max_depth': [1, 2, 3, 5, 10],
              'lambda_l2': [0, 1, 10]}
other_params = {'num_leaves': 2**10, 'verbose': 0}
gp_model = gpb.GPModel(group_data=clst_train, likelihood="gaussian")
data_train = gpb.Dataset(data=X_train, label=y_train)
opt_params = gpb.grid_search_tune_parameters(param_grid=param_grid, params=other_params,
                                             num_try_random=None, nfold=4, seed=1000,
                                             train_set=data_train, gp_model=gp_model,
                                             use_gp_model_for_validation=True, verbose_eval=1,
                                             num_boost_round=1000, early_stopping_rounds=10,
                                             metric="mae")
# Best parameters:                                   
# ***** New best test score (l1 = 0.9031159556358567) found for the following parameter combination:
# {'learning_rate': 1.0, 'min_data_in_leaf': 10, 'max_depth': 1, 'lambda_l2': 1, 'num_boost_round': 642}

# Training and prediction
bst = gpb.GPBoostRegressor(n_estimators=642, learning_rate=1, num_leaves=2**10,
                           max_depth=1, min_data_in_leaf=10, reg_lambda=1)
bst.fit(X_train, y_train, gp_model=gp_model)

preds = bst.predict(X_test, group_data_pred=clst_test)
print(np.sum(np.absolute(y_test - preds["response_mean"])))  # sum of absolute errors
# 189.18370198237892, i.e. an MAE of roughly 0.95 over the 200 test points
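As a sanity check on the linear-model point, here is a minimal sketch of a plain linear-regression baseline on the same mock data from above (my own addition for comparison, not part of the tuning script). Since y is exactly the sum of the columns of X and the clusters carry no signal, ordinary least squares should reach an MAE near 0:

from sklearn.linear_model import LinearRegression

# Linear baseline: y is an exact linear function of X,
# so OLS can recover it up to numerical noise.
lin = LinearRegression().fit(X_train, y_train)
print(np.mean(np.absolute(y_test - lin.predict(X_test))))  # MAE, expected ~0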