Closed jonathan-f closed 1 year ago
Hello, I modified the PipelineHelper class so that it also keeps the selected features and the feature importances (if present). I am providing the code here in case it is useful to other users.
With the new definition of the class and the same pipeline and search space as in my previous comment, after running the grid search CV
gs = GridSearchCV(pipe, search_space, cv=5, verbose=3,n_jobs=-1,scoring = 'roc_auc')
gs = gs.fit(X, y)
it is possible to get the Boolean vector of the selected features as done usually in sklearn
gs.best_estimator_.named_steps['selector'].get_support()
and the feature importance can be obtained in the following way:
if hasattr(gs.best_estimator_.named_steps['classifier'], 'feature_importances'):
feature_importances = gs.best_estimator_.named_steps['classifier'].feature_importances
Here is the code for the new class:
class PipelineHelper(BaseEstimator, TransformerMixin, ClassifierMixin):
    """
    This class can be used in scikit pipelines to select elements.

    In addition to the "replace_estimator" functionality of scikit itself,
    this class allows to set specified parameters for each option in the list.

    After ``fit`` the helper also exposes:

    * ``selected_features``: the result of the selected model's
      ``get_support()`` if it has such a method, otherwise ``None``.
    * ``feature_importances``: the selected model's ``feature_importances_``
      if it has that attribute, otherwise ``None``.
    """

    def __init__(
        self,
        available_models=None,
        selected_model=None,
        include_bypass=False,
        optional=False,
    ):
        """
        Selects elements from a list to use as estimators in a pipeline.

        Args:
            available_models: the models which should be selected from,
                either as a dict ``{name: model}`` (this is what the clone
                operation passes in) or as an iterable of ``(name, model)``
                pairs. ``None`` is treated as "no models".
                If you have time on your hands, please enable the use of
                pipelines here.
            selected_model: this parameter is required for the clone operation
                used by gridsearch. It should only be used initially if no grid
                search is used.
            include_bypass: accepted for signature compatibility; this class
                does not read it.
            optional: if set to true, one of the resulting configurations will
                have this stage empty.
        """
        self.optional = optional
        # this is required for the clone operator used in gridsearch
        self.selected_model = selected_model
        # filled in by fit()
        self.selected_features = None

        if available_models is None:
            # the declared default must not crash the constructor
            self.available_models = {}
        elif isinstance(available_models, dict):
            # cloned: keep the very same object, do not copy it
            self.available_models = available_models
        else:
            # manually initialized from (name, model) pairs
            self.available_models = dict(available_models)

    def generate(self, param_dict=None):
        """
        Generates the parameters that are required for a gridsearch.

        Args:
            param_dict: parameters for the available models provided in the
                constructor, keyed as ``<model name>__<parameter name>``.
                Note that these don't require the prefix path of
                all elements higher up the hierarchy of this TransformerPicker.

        Returns:
            A list of ``(model_name, parameters)`` tuples: one per parameter
            combination of every model mentioned in ``param_dict``, one with
            empty parameters for every other available model, and a final
            ``(None, {})`` entry when this stage is optional.

        Raises:
            ValueError: if a key refers to a model that is not available.
        """
        if param_dict is None:
            param_dict = {}

        # collect parameters for each specified model
        per_model_parameters = defaultdict(dict)
        for key, values in param_dict.items():
            # example: randomforest__n_estimators
            model_name, _, param_name = key.partition('__')
            if model_name not in self.available_models:
                raise ValueError('no such model: {0}'.format(model_name))
            per_model_parameters[model_name][param_name] = values

        # create instance for cartesian product of all available parameters
        # for each model
        ret = [
            (model_name, parameters)
            for model_name, model_grid in per_model_parameters.items()
            for parameters in ParameterGrid(model_grid)
        ]

        # for every model that has no specified parameters, add default value
        ret.extend(
            (model_name, {})
            for model_name in self.available_models
            if model_name not in per_model_parameters
        )

        if self.optional:
            ret.append((None, {}))
        return ret

    def get_support(self):
        """
        Returns the feature mask recorded by the last call to ``fit``.

        Raises:
            ValueError: if no mask has been recorded (not fitted yet, or the
                selected model has no ``get_support`` method).
        """
        if self.selected_features is None:
            raise ValueError('Selected features are not available.')
        return self.selected_features

    def get_params(self, deep=True):
        """
        Returns the parameters of the current TransformerPicker instance.

        Note that this is different from the parameters used by the selected
        model. Provided for scikit estimator compatibility.

        Args:
            deep: if true, also include the parameters of the selected model
                (prefixed with ``selected_model__``) and of every available
                model (prefixed with ``available_models__<name>__``).
        """
        result = {
            'available_models': self.available_models,
            'selected_model': self.selected_model,
            'optional': self.optional,
        }
        # Compare against None instead of relying on truthiness: an estimator
        # may define __len__ (ensembles do), which makes bool() unreliable
        # and can even raise before the estimator has been fitted.
        if deep and self.selected_model is not None:
            result.update({
                'selected_model__' + k: v
                for k, v in self.selected_model.get_params(deep=True).items()
            })
        if deep and self.available_models:
            for name, model in self.available_models.items():
                result['available_models__' + name] = model
                result.update({
                    'available_models__' + name + '__' + k: v
                    for k, v in model.get_params(deep=True).items()
                })
        return result

    @property
    def transformer_list(self):
        """
        Returns all available models (the ``{name: model}`` mapping).

        Provided for scikit estimator compatibility.
        """
        return self.available_models

    def set_params(self,
                   selected_model,
                   available_models=None,
                   optional=False):
        """
        Selects one of the available models and applies its parameters.

        Provided for scikit estimator compatibility.

        Args:
            selected_model: a ``(model_name, parameters)`` pair as produced by
                ``generate``. A ``model_name`` of ``None`` empties this stage.
            available_models: if given (and non-empty), replaces the mapping
                of available models before the selection is made.
            optional: accepted for signature compatibility; this method does
                not apply it.

        Raises:
            ValueError: if ``model_name`` is not one of the available models.
        """
        if available_models:
            self.available_models = available_models

        model_name, model_params = selected_model[0], selected_model[1]
        if model_name is None:
            self.selected_model = None
            return self

        if model_name not in self.available_models:
            raise ValueError(
                f'trying to set selected model {model_name}, which '
                f'is not in the available models '
                f'{list(self.available_models)}.'
            )
        self.selected_model = self.available_models[model_name]
        if self.selected_model is not None:
            self.selected_model.set_params(**model_params)
        return self

    def fit(self, x, y=None, **kwargs):
        """
        Fits the selected model.

        Also records ``selected_features`` and ``feature_importances`` from
        the fitted model when it provides them; both are ``None`` otherwise,
        including when this stage is empty or set to ``'passthrough'``.
        """
        # start from a clean state so results of an earlier fit never leak
        self.selected_features = None
        self.feature_importances = None

        if self.selected_model is None or self.selected_model == 'passthrough':
            return self

        self.selected_model.fit(x, y, **kwargs)
        if hasattr(self.selected_model, 'get_support'):
            self.selected_features = self.selected_model.get_support()
        if hasattr(self.selected_model, 'feature_importances_'):
            self.feature_importances = self.selected_model.feature_importances_
        return self

    def transform(self, x, *args, **kwargs):
        """Transforms data with the selected model (identity if bypassed)."""
        if self.selected_model is None or self.selected_model == 'passthrough':
            return x
        return self.selected_model.transform(x, *args, **kwargs)

    def predict(self, x):
        """
        Predicts data with the selected model.

        Raises:
            ValueError: if this stage was declared optional.
        """
        if self.optional:
            raise ValueError('a classifier cannot be optional')
        return self.selected_model.predict(x)

    @if_delegate_has_method(delegate='selected_model')
    def predict_proba(self, x):
        """Delegates to the selected model's ``predict_proba``."""
        return self.selected_model.predict_proba(x)

    @if_delegate_has_method(delegate='selected_model')
    def decision_function(self, x):
        """Delegates to the selected model's ``decision_function``."""
        return self.selected_model.decision_function(x)

    @property
    def classes_(self):
        """
        The ``classes_`` of the selected model.

        Raises:
            ValueError: if the selected model has no ``classes_`` attribute.
        """
        if hasattr(self.selected_model, 'classes_'):
            return self.selected_model.classes_
        raise ValueError('selected model does not provide classes_')
Best wishes, Jonathan
Thank you for your feedback.
I have updated the class to support get_support() by using the @if_delegate_has_method decorator.
Also, I have updated the README - this project will be deprecated soon, and the native scikit-learn methods should be preferred.
Hi, thanks for the great package! I am using PipelineHelper to build a Pipeline that I pass as input to grid search CV; here is some example code:
The grid search CV works fine, then I want to get the features that are selected by the best estimator, using the command
clf1.best_estimator_.named_steps['selector'].get_support()
but I get this error: AttributeError: 'PipelineHelper' object has no attribute 'get_support'. Is there a way to extract the selected features from a PipelineHelper object?
Thank you, Best wishes Jonathan