bmurauer / pipelinehelper

scikit-helper to hot-swap pipeline elements
GNU General Public License v3.0
21 stars 9 forks source link

updating piplinehelper to match new version of sklearn #19

Open Mo-Khalifa96 opened 3 months ago

Mo-Khalifa96 commented 3 months ago

The pipelinehelper here depends on sklearn.utils.metaestimators.if_delegate_has_method which has since been deprecated and removed in the newer versions of sklearn (sklearn => 1.3) . Instead, sklearn.utils.metaestimators.available_if is now used. So, I have adjusted the pipelinehelper to work with the new changes.


 """Selects elements from a scikit pipeline with a working parametergrid."""

from collections import defaultdict

from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from sklearn.base import TransformerMixin
from sklearn.model_selection import ParameterGrid
from sklearn.utils.metaestimators import available_if

class PipelineHelper(BaseEstimator, TransformerMixin, ClassifierMixin):
    """
    This class can be used in scikit pipelines to select elements.

    In addition to the "replace_estimator" functionality of scikit itself,
    this class allows to set specified parameters for each option in the list.
    """

    def __init__(
            self,
            available_models=None,
            selected_model=None,
            include_bypass=False,
            optional=False,
    ):
        """
        Selects elements from a list to use as estimators in a pipeline.

        Args:
            available_models: a list of models which should be selected from.
                If you have time on your hands, please enable the use of
                pipelines here.
            selected_model: this parameter is required for the clone operation
                used by gridsearch. It should only be used initially if no grid
                search is used.
            optional: if set to true, one of the resulting configurations will
                have this stage empty.
        """
        self.optional = optional
        # this is required for the clone operator used in gridsearch
        self.selected_model = selected_model

        # cloned
        if type(available_models) == dict:
            self.available_models = available_models
        else:
            # manually initialized
            self.available_models = {}
            for (key, model) in available_models:
                self.available_models[key] = model

    def generate(self, param_dict=None):
        """
        Generates the parameters that are required for a gridsearch.

        Args:
            param_dict: parameters for the available models provided in the
                constructor. Note that these don't require the prefix path of
                all elements higher up the hierarchy of this TransformerPicker.

        """
        if param_dict is None:
            param_dict = dict()
        per_model_parameters = defaultdict(lambda: defaultdict(list))

        # collect parameters for each specified model
        for k, values in param_dict.items():
            # example:  randomforest__n_estimators
            model_name = k.split('__')[0]
            param_name = k[len(model_name) + 2:]
            if model_name not in self.available_models:
                raise Exception('no such model: {0}'.format(model_name))
            per_model_parameters[model_name][param_name] = values

        ret = []

        # create instance for cartesion product of all available parameters
        # for each model
        for model_name, param_dict in per_model_parameters.items():
            parameter_sets = ParameterGrid(param_dict)
            for parameters in parameter_sets:
                ret.append((model_name, parameters))

        # for every model that has no specified parameters, add default value
        for model_name in self.available_models.keys():
            if model_name not in per_model_parameters:
                ret.append((model_name, dict()))

        if self.optional:
            ret.append((None, dict()))
        return ret

    def get_params(self, deep=True):
        """
        Returns the parameters of the current TransformerPicker instance.

        Note that this is different from the parameters used by the selected
        model. Provided for scikit estimator compatibility.
        """
        result = {
            'available_models': self.available_models,
            'selected_model': self.selected_model,
            'optional': self.optional,
        }
        if deep and self.selected_model:
            result.update({
                'selected_model__' + k: v
                for k, v in self.selected_model.get_params(deep=True).items()
            })
        if deep and self.available_models:
            for name, model in self.available_models.items():
                result['available_models__' + name] = model
                result.update({
                    'available_models__' + name + '__' + k: v
                    for k, v in model.get_params(deep=True).items()
                })
        return result

    @property
    def transformer_list(self):
        """
        Returns a list of all available models.

        Provided for scikit estimator compatibility.
        """
        return self.available_models

    def set_params(self,
                   selected_model,
                   available_models=None,
                   optional=False):
        """
        Sets the parameters to all available models.

        Provided for scikit estimator compatibility.
        """
        if available_models:
            self.available_models = available_models

        if selected_model[0] is None:
            self.selected_model = None
        else:
            if selected_model[0] not in self.available_models:
                raise ValueError(
                    'trying to set selected model {selected_model[0]}, which '
                    f'is not in the available models {available_models}.'
                )
            self.selected_model = self.available_models[selected_model[0]]
            if self.selected_model is not None:
                self.selected_model.set_params(**selected_model[1])
        return self

    def fit(self, x, y=None, **kwargs):
        """Fits the selected model."""
        if self.selected_model is None or self.selected_model == 'passthrough':
            return self
        else:
            return self.selected_model.fit(x, y, **kwargs)

    def transform(self, x, *args, **kwargs):
        """Transforms data with the selected model."""
        if self.selected_model is None or self.selected_model == 'passthrough':
            return x
        else:
            return self.selected_model.transform(x, *args, **kwargs)

    def predict(self, x):
        """Predicts data with the selected model."""
        if self.optional:
            raise ValueError('a classifier can not be optional')
        return self.selected_model.predict(x)

    @available_if('selected_model')
    def predict_proba(self, x):
        return self.selected_model.predict_proba(x)

    @available_if('selected_model')
    def decision_function(self, x):
        return self.selected_model.decision_function(x)

    @property
    def classes_(self):
        if hasattr(self.selected_model, 'classes_'):
            return self.selected_model.classes_
        raise ValueError('selected model does not provide classes_')