Closed VHolstein closed 1 year ago
Thanks for using GPBoost!
Can you please provide a reproducible example?
Yes sure! Sorry for not providing this earlier. I made a mock example, very similar to the data I was using to debug my code. The model produces a mean absolute error quite similar to the Dummy Estimator (which just guesses the mean for every sample). In theory the model should be able to esimate the target accurately as it is just the sum of all the values in a row. I'm sure I'm making an error in the way I use your code, I just have not been able to figure it out.
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import GroupKFold, KFold
from photonai.base import Hyperpipe, PipelineElement
import numpy as np
import pandas as pd
import gpboost as gpb
class GPBoostDataWrapper(BaseEstimator, ClassifierMixin):
def __init__(self):
self.needs_covariates = True
# self.gpmodel = gpb.GPModel(likelihood="gaussian")
self.gpboost = gpb.GPBoostRegressor()
def fit(self, X, y, **kwargs):
if "clusters" in kwargs:
clst = pd.Series(kwargs["clusters"])
gpmodel = gpb.GPModel(likelihood="gaussian", group_data=clst)
self.gpboost.fit(X, y, gp_model=gpmodel)
else:
raise NotImplementedError("GPBoost needs clusters")
return self
def predict(self, X, **kwargs):
clst = pd.Series(kwargs["clusters"])
preds = self.gpboost.predict(X, group_data_pred=clst)
preds = preds["response_mean"]
return preds
def save(self):
return None
def get_gpboost_pipe(pipe_name, project_folder, split="group"):
if split == "group":
outercv = GroupKFold(n_splits=10)
else:
outercv = KFold(n_splits=10)
my_pipe = Hyperpipe(pipe_name,
optimizer='grid_search',
metrics=['mean_absolute_error', 'mean_squared_error',
'spearman_correlation', 'pearson_correlation'],
best_config_metric='mean_absolute_error',
outer_cv=outercv,
inner_cv=KFold(n_splits=10),
calculate_metrics_across_folds=True,
use_test_set=True,
verbosity=1,
project_folder=project_folder)
# Add transformer elements
my_pipe += PipelineElement("StandardScaler", hyperparameters={},
test_disabled=True, with_mean=True, with_std=True)
my_pipe += PipelineElement.create("GPBoost", GPBoostDataWrapper(), hyperparameters={})
return my_pipe
def get_mock_data():
X = np.random.randint(10, size=(200, 9))
y = np.sum(X, axis=1)
clst = np.random.randint(10, size=200)
return X, y, clst
X, y, clst = get_mock_data()
# define project folder
project_folder = "/tmp/gpboost_debug"
my_pipe = get_gpboost_pipe("Test_gpboost", project_folder, split="random")
my_pipe.fit(X, y, clusters=clst)
I get an error AttributeError: 'Hyperpipe' object has no attribute 'best_config_metric'
when trying to run your code (photonai version 2.4.0).
In addition, can you please also provide code that shows you calculate your mean absolute error and compare it to a "dummy estimator".
I just updated to photonai version 2.4.0 and it still works with the code above. Previously I was running it with version 2.3.0 - I also checked the source code for the Hyperpipe, and it seems to have a best_config_metric attribute (https://github.com/wwu-mmll/photonai/blob/main/photonai/base/hyperpipe.py; line 282). Maybe testing it with version 2.3.0 will work better?
The photonai package calculates the mean absolute error and runs the dummy estimator. It's automatically done by the Hyperpipe. You select your metrics and best_config_metric - if these are regression metrics the dummy estimator will guess the mean, if these are classification metrics the dummy estimator will guess the most common value.
I am still getting the same error with photonai version 2.3.0.
Traceback (most recent call last):
File "C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\formatters.py", line 345, in __call__
return method()
File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\base.py", line 625, in _repr_html_inner
return estimator_html_repr(self)
File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\_estimator_html_repr.py", line 417, in estimator_html_repr
_write_estimator_html(
File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\_estimator_html_repr.py", line 151, in _write_estimator_html
est_block = _get_visual_block(estimator)
File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\_estimator_html_repr.py", line 127, in _get_visual_block
for key, est in estimator.get_params(deep=False).items()
File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\base.py", line 211, in get_params
value = getattr(self, key)
AttributeError: 'Hyperpipe' object has no attribute 'best_config_metric'
Out[1]: Hyperpipe(name='Test_gpboost')
Dear fabsig,
the error you are experiencing is because apparently you are using a Jupyter Notebook.
The fit method returns self, aka a hyperpipe object, and it tries to print it.
Please change the last line of code to :
_ = my_pipe.fit(X, y, clusters=clst)
Dear fabsig, I have adapted the code to test it out of the photonai package and it still poses the same error. Maybe I'm using the GPBoost algorithm the wrong way?
from sklearn.base import BaseEstimator, ClassifierMixin
import numpy as np
import pandas as pd
import gpboost as gpb
class GPBoostDataWrapper(BaseEstimator, ClassifierMixin):
def __init__(self):
self.needs_covariates = True
# self.gpmodel = gpb.GPModel(likelihood="gaussian")
self.gpboost = gpb.GPBoostRegressor()
def fit(self, X, y, **kwargs):
if "clusters" in kwargs:
clst = pd.Series(kwargs["clusters"])
gpmodel = gpb.GPModel(likelihood="gaussian", group_data=clst)
self.gpboost.fit(X, y, gp_model=gpmodel)
else:
raise NotImplementedError("GPBoost needs clusters")
return self
def predict(self, X, **kwargs):
if "clusters" in kwargs:
clst = pd.Series(kwargs["clusters"])
preds = self.gpboost.predict(X, group_data_pred=clst)
preds = preds["response_mean"]
return preds
else:
raise NotImplementedError("GPBoost needs clusters")
def save(self):
return None
def get_mock_data():
X = np.random.randint(10, size=(200, 9))
y = np.sum(X, axis=1)
clst = np.random.randint(10, size=200)
return X, y, clst
X_train, y_train, clst_train = get_mock_data()
X_test, y_test, clst_test = get_mock_data()
model = GPBoostDataWrapper()
model.fit(X_train, y_train, clusters=clst_train)
y_pred = model.predict(X_test, clusters=clst_test)
print(np.sum(np.absolute((y_test - y_pred))))
I've also added an even simpler test case that illustrates the same problem.
import numpy as np
import pandas as pd
import gpboost as gpb
def get_mock_data():
X = np.random.randint(10, size=(200, 9))
y = np.sum(X, axis=1)
clst = np.random.randint(10, size=200)
return X, y, clst
# get train/test data
X_train, y_train, clst_train = get_mock_data()
X_test, y_test, clst_test = get_mock_data()
gp_model = gpb.GPModel(group_data=clst_train, likelihood="gaussian")
bst = gpb.GPBoostRegressor()
bst.fit(X_train, y_train, gp_model=gp_model)
preds = bst.predict(X_test, group_data_pred=clst_test)
print(np.sum(np.absolute(y_test - preds["response_mean"])) / len(y))
Thanks for the example.
The reason is that you do not choose tuning parameters, but use default values. You must always appropriately choose tuning parameters for every data set when applying boosting. There are no universal "default values". The script below shows how this can be done in your example. I obtain a much smaller MAE. Note that in this example, you simulate from a linear model with a sample size of only n=200. It is thus expected that a linear model performs better than a tree-boosting model (the difference will become smaller for larger sample sizes when the true model linear).
gp_model = gpb.GPModel(group_data=clst_train, likelihood="gaussian")
# Choose tuning parameters
param_grid = {'learning_rate': [1,0.1,0.01],
'min_data_in_leaf': [10,100,1000],
'max_depth': [1,2,3,5,10],
'lambda_l2': [0,1,10]}
other_params = {'num_leaves': 2**10, 'verbose': 0}
gp_model = gpb.GPModel(group_data=clst_train, likelihood="gaussian")
data_train = gpb.Dataset(data=X_train, label=y_train)
opt_params = gpb.grid_search_tune_parameters(param_grid=param_grid, params=other_params,
num_try_random=None, nfold=4, seed=1000,
train_set=data_train, gp_model=gp_model,
use_gp_model_for_validation=True, verbose_eval=1,
num_boost_round=1000, early_stopping_rounds=10,
metric = "mae")
# Best parameters:
# ***** New best test score (l1 = 0.9031159556358567) found for the following parameter combination:
# {'learning_rate': 1.0, 'min_data_in_leaf': 10, 'max_depth': 1, 'lambda_l2': 1, 'num_boost_round': 642}
# Training and prediction
bst = gpb.GPBoostRegressor(n_estimators=642, learning_rate=1, num_leaves=2**10,
max_depth=1, min_data_in_leaf=10, reg_lambda=1)
bst.fit(X_train, y_train, gp_model=gp_model)
preds = bst.predict(X_test, group_data_pred=clst_test)
print(np.sum(np.absolute(y_test - preds["response_mean"])))
# 189.18370198237892
I'm currently using your package inside of another python package (photonai) to build a model pipeline. This is done to run a number of different models through the same pipeline. I have integrated your GPBoostRegressor as follows below. For testing purposes I'm training this model on a simple dataframe where X is a vector of numbers and y is the sum of these numbers. I was expecting a mean of absolute error of around 0, but this is not the case. Any suggestions what I might be doing wrong when integrating your model like this?