csinva / imodels

Interpretable ML package 🔍 for concise, transparent, and accurate predictive modeling (sklearn-compatible).
https://csinva.io/imodels
MIT License

Bugs when fitting SLIMClassifier, BayesianRuleSetClassifier, SlipperClassifier, TaoTreeRegressor #186

Open · jckkvs opened this issue 1 year ago

jckkvs commented 1 year ago

I tested the fit methods of almost all classifiers and regressors. The following four models failed to fit: SLIMClassifier, BayesianRuleSetClassifier, SlipperClassifier, and TaoTreeRegressor.

import pytest
from imodels import *
from sklearn.base import clone
from sklearn.datasets import make_regression, make_classification

classifiers = [
    SLIMClassifier(),
    OptimalRuleListClassifier(),
    GreedyRuleListClassifier(),
    OneRClassifier(),
    BoostedRulesClassifier(),
    BayesianRuleSetClassifier(),
    RuleFitClassifier(),
    SkopeRulesClassifier(),
    SlipperClassifier(),
    C45TreeClassifier(),
    GreedyTreeClassifier(),
    FIGSClassifier(),
    FIGSClassifierCV(),
    HSTreeClassifier(),
    HSTreeClassifierCV(),
    TaoTreeClassifier(),
]

# not tested: these require discretized X
classifiers_for_discretized_X = [
    BayesianRuleListClassifier(),
    FPLassoClassifier(),
    FPSkopeClassifier(),
]

regressors = [
    SLIMRegressor(),
    BoostedRulesRegressor(),
    RuleFitRegressor(),
    GreedyTreeRegressor(),
    FIGSRegressor(),
    FIGSRegressorCV(),
    HSTreeRegressor(),
    HSTreeRegressorCV(),
    TaoTreeRegressor(),
]

# not tested: requires discretized X
regressors_for_discretized_X = [
    FPLassoRegressor(),
]

@pytest.mark.parametrize("classifier", classifiers)
def test_fit_classifier(classifier) -> None:
    X, y = make_classification(n_samples=25, n_features=5)
    classifier.fit(X, y)

@pytest.mark.parametrize("regressor", regressors)
def test_fit_regressor(regressor) -> None:
    X, y = make_regression(n_samples=25, n_features=5)
    regressor.fit(X, y)

Here are the test results:

_____________________________________________ test_fit_classifier[classifier1] ______________________________________________

classifier = CorelsClassifier ({'c': 0.01, 'n_iter': 10000, 'map_type': 'prefix', 'policy': 'lower_bound', 'verbosity': [], 'ablation': 0, 'max_card': 2, 'min_support': 0.01})

    @pytest.mark.parametrize("classifier", classifiers)
    def test_fit_classifier(classifier) -> None:
        X, y = make_classification(n_samples=25, n_features=5)
>       classifier_ = clone(classifier)

tests\test_fit_print.py:52:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

estimator = CorelsClassifier ({'c': 0.01, 'n_iter': 10000, 'map_type': 'prefix', 'policy': 'lower_bound', 'verbosity': [], 'ablation': 0, 'max_card': 2, 'min_support': 0.01})

    def clone(estimator, *, safe=True):
        """Construct a new unfitted estimator with the same parameters.

        Clone does a deep copy of the model in an estimator
        without actually copying attached data. It returns a new estimator
        with the same parameters that has not been fitted on any data.

        Parameters
        ----------
        estimator : {list, tuple, set} of estimator instance or a single \
                estimator instance
            The estimator or group of estimators to be cloned.
        safe : bool, default=True
            If safe is False, clone will fall back to a deep copy on objects
            that are not estimators.

        Returns
        -------
        estimator : object
            The deep copy of the input, an estimator if input is an estimator.

        Notes
        -----
        If the estimator's `random_state` parameter is an integer (or if the
        estimator doesn't have a `random_state` parameter), an *exact clone* is
        returned: the clone and the original estimator will give the exact same
        results. Otherwise, *statistical clone* is returned: the clone might
        return different results from the original estimator. More details can be
        found in :ref:`randomness`.
        """
        estimator_type = type(estimator)
        # XXX: not handling dictionaries
        if estimator_type in (list, tuple, set, frozenset):
            return estimator_type([clone(e, safe=safe) for e in estimator])
        elif not hasattr(estimator, "get_params") or isinstance(estimator, type):
            if not safe:
                return copy.deepcopy(estimator)
            else:
                if isinstance(estimator, type):
                    raise TypeError(
                        "Cannot clone object. "
                        + "You should provide an instance of "
                        + "scikit-learn estimator instead of a class."
                    )
                else:
                    raise TypeError(
                        "Cannot clone object '%s' (type %s): "
                        "it does not seem to be a scikit-learn "
                        "estimator as it does not implement a "
                        "'get_params' method." % (repr(estimator), type(estimator))
                    )

        klass = estimator.__class__
>       new_object_params = estimator.get_params(deep=False)
E       TypeError: CorelsClassifier.get_params() got an unexpected keyword argument 'deep'

..\..\..\Anaconda3\envs\py310\lib\site-packages\sklearn\base.py:87: TypeError
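This first failure comes from sklearn's clone, which calls get_params(deep=False); the CorelsClassifier wrapped by OptimalRuleListClassifier apparently defines get_params without the deep keyword. For reference, a minimal sketch of a clone-compatible signature (a hypothetical class, not the actual imodels/corels code):

from sklearn.base import BaseEstimator, clone

class CorelsWrapperSketch(BaseEstimator):
    """Hypothetical wrapper illustrating a clone-compatible get_params."""

    def __init__(self, c=0.01, n_iter=10000):
        # BaseEstimator derives get_params/set_params from the __init__
        # arguments, so storing them under the same names is enough.
        self.c = c
        self.n_iter = n_iter

    def get_params(self, deep=True):
        # If get_params is overridden, it must accept the `deep` keyword,
        # because sklearn.base.clone calls get_params(deep=False).
        return {"c": self.c, "n_iter": self.n_iter}

clone(CorelsWrapperSketch())  # no TypeError: the signature matches sklearn's contract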
_____________________________________________ test_fit_classifier[classifier5] ______________________________________________

classifier = BayesianRuleSetClassifier()

    @pytest.mark.parametrize("classifier", classifiers)
    def test_fit_classifier(classifier) -> None:
        X, y = make_classification(n_samples=25, n_features=5)
        classifier_ = clone(classifier)
>       classifier_.fit(X, y)

tests\test_fit_print.py:53:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = BayesianRuleSetClassifier(alpha_l=[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10],
                          beta_l=[-0.0...
                                  25200.0, 21000.0, 12000.0, 4500.0, 1000.0,
                                  100.0])
X =           X0        X1        X2        X3        X4
0   1.856723 -0.676788 -2.081929  0.139416  1.345762
1   0.942105...948  2.517298
23 -0.556286 -2.165002 -0.522723  1.466807 -0.796446
24 -1.369548  1.188797 -0.544919 -0.542191 -0.878127
y = array([1., 1., 1., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 1., 1., 0., 1.,
       1., 0., 0., 0., 1., 1., 0., 0.])
feature_names = ['X0', 'X1', 'X2', 'X3', 'X4'], init = [], verbose = False

    def fit(self, X, y, feature_names: list = None, init=[], verbose=False):
        '''
        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training data
        y : array_like, shape = [n_samples]
            Labels

        feature_names : array_like, shape = [n_features], optional (default: [])
            String labels for each feature.
            If empty and X is a DataFrame, column labels are used.
            If empty and X is not a DataFrame, then features are simply enumerated
        '''
        # check inputs
        self.attr_level_num = defaultdict(int)  # any missing value defaults to 0
        self.attr_names = []

        X, y, feature_names = check_fit_arguments(self, X, y, feature_names)
        np.random.seed(self.random_state)

        # convert to pandas DataFrame
        X = pd.DataFrame(X, columns=feature_names)

        for i, name in enumerate(X.columns):
            self.attr_level_num[name] += 1
            self.attr_names.append(name)
        self.attr_names_orig = deepcopy(self.attr_names)
        self.attr_names = list(set(self.attr_names))

        # set up patterns
        self._set_pattern_space()

        # parameter checking
        if self.alpha_l is None or self.beta_l is None or len(self.alpha_l) != self.maxlen or len(
                self.beta_l) != self.maxlen:
            if verbose:
                print('No or wrong input for alpha_l and beta_l - the model will use default parameters.')
            self.C = [1.0 / self.maxlen] * self.maxlen
            self.C.insert(0, -1)
            self.alpha_l = [10] * (self.maxlen + 1)
            self.beta_l = [10 * self.pattern_space[i] / self.C[i] for i in range(self.maxlen + 1)]
        else:
            self.alpha_l = [1] + list(self.alpha_l)
            self.beta_l = [1] + list(self.beta_l)

        # setup
        self._generate_rules(X, y, verbose)
        n_rules_current = len(self.rules_)
        self.rules_len_list = [len(rule) for rule in self.rules_]
        maps = defaultdict(list)
        T0 = 1000  # initial temperature for simulated annealing
        split = 0.7 * self.num_iterations

        # run simulated annealing
        for chain in range(self.num_chains):
            # initialize with a random pattern set
            if init != []:
                rules_curr = init.copy()
            else:
>               assert n_rules_current > 1, f'Only {n_rules_current} potential rules found, change hyperparams to allow for more'
E               AssertionError: Only 0 potential rules found, change hyperparams to allow for more

imodels\rule_set\brs.py:147: AssertionError
--------------------------------------------------- Captured stdout call ----------------------------------------------------
mat.shape (25, 13626)

p1.shape (13626,) pp.shape (13626,) cond_entropy.shape
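For BayesianRuleSetClassifier the failure is in the rule-mining step: with 25 continuous samples it finds zero candidate rules and the assertion fires. A hedged workaround sketch (the sample size and median-based binarization are my guesses, not values recommended by imodels):

import numpy as np
from sklearn.datasets import make_classification
from imodels import BayesianRuleSetClassifier

# More samples plus crudely binarized features, so the rule miner has
# something to work with; this may still fail if fewer than 2 rules are found.
X, y = make_classification(n_samples=400, n_features=5, random_state=0)
X_bin = (X > np.median(X, axis=0)).astype(int)

BayesianRuleSetClassifier().fit(X_bin, y)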
_____________________________________________ test_fit_classifier[classifier12] _____________________________________________

classifier = <imodels.tree.figs.FIGSClassifierCV object at 0x0000024E24343580>

    @pytest.mark.parametrize("classifier", classifiers)
    def test_fit_classifier(classifier) -> None:
        X, y = make_classification(n_samples=25, n_features=5)
>       classifier_ = clone(classifier)

tests\test_fit_print.py:52:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

estimator = <imodels.tree.figs.FIGSClassifierCV object at 0x0000024E24343580>

    def clone(estimator, *, safe=True):
        """Construct a new unfitted estimator with the same parameters.

        Clone does a deep copy of the model in an estimator
        without actually copying attached data. It returns a new estimator
        with the same parameters that has not been fitted on any data.

        Parameters
        ----------
        estimator : {list, tuple, set} of estimator instance or a single \
                estimator instance
            The estimator or group of estimators to be cloned.
        safe : bool, default=True
            If safe is False, clone will fall back to a deep copy on objects
            that are not estimators.

        Returns
        -------
        estimator : object
            The deep copy of the input, an estimator if input is an estimator.

        Notes
        -----
        If the estimator's `random_state` parameter is an integer (or if the
        estimator doesn't have a `random_state` parameter), an *exact clone* is
        returned: the clone and the original estimator will give the exact same
        results. Otherwise, *statistical clone* is returned: the clone might
        return different results from the original estimator. More details can be
        found in :ref:`randomness`.
        """
        estimator_type = type(estimator)
        # XXX: not handling dictionaries
        if estimator_type in (list, tuple, set, frozenset):
            return estimator_type([clone(e, safe=safe) for e in estimator])
        elif not hasattr(estimator, "get_params") or isinstance(estimator, type):
            if not safe:
                return copy.deepcopy(estimator)
            else:
                if isinstance(estimator, type):
                    raise TypeError(
                        "Cannot clone object. "
                        + "You should provide an instance of "
                        + "scikit-learn estimator instead of a class."
                    )
                else:
>                   raise TypeError(
                        "Cannot clone object '%s' (type %s): "
                        "it does not seem to be a scikit-learn "
                        "estimator as it does not implement a "
                        "'get_params' method." % (repr(estimator), type(estimator))
                    )
E                   TypeError: Cannot clone object '<imodels.tree.figs.FIGSClassifierCV object at 0x0000024E24343580>' (type <class 'imodels.tree.figs.FIGSClassifierCV'>): it does not seem to be a scikit-learn estimator as it does not implement a 'get_params' method.

..\..\..\Anaconda3\envs\py310\lib\site-packages\sklearn\base.py:79: TypeError
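For FIGSClassifierCV (and FIGSRegressorCV below) the problem is simply that the CV wrapper does not expose get_params, so sklearn refuses to clone it. For comparison, any class that subclasses sklearn.base.BaseEstimator and only stores its __init__ keyword arguments gets get_params/set_params for free; a minimal hypothetical sketch, not the imodels implementation:

from sklearn.base import BaseEstimator, ClassifierMixin, clone

class FIGSCVSketch(BaseEstimator, ClassifierMixin):
    """Hypothetical CV wrapper used only to illustrate clone compatibility."""

    def __init__(self, n_trees_list=(1, 2, 4), cv=3):
        # Storing constructor args under their own names is all that
        # BaseEstimator needs to supply get_params/set_params.
        self.n_trees_list = n_trees_list
        self.cv = cv

print(clone(FIGSCVSketch()).get_params())  # clones without a TypeError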
______________________________________________ test_fit_regressor[regressor5] _______________________________________________

regressor = <imodels.tree.figs.FIGSRegressorCV object at 0x0000024E24343A00>

    @pytest.mark.parametrize("regressor", regressors)
    def test_fit_regressor(regressor) -> None:
        X, y = make_regression(n_samples=25, n_features=5)
>       regressor_ = clone(regressor)

tests\test_fit_print.py:59:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

estimator = <imodels.tree.figs.FIGSRegressorCV object at 0x0000024E24343A00>

    def clone(estimator, *, safe=True):
        """Construct a new unfitted estimator with the same parameters.

        Clone does a deep copy of the model in an estimator
        without actually copying attached data. It returns a new estimator
        with the same parameters that has not been fitted on any data.

        Parameters
        ----------
        estimator : {list, tuple, set} of estimator instance or a single \
                estimator instance
            The estimator or group of estimators to be cloned.
        safe : bool, default=True
            If safe is False, clone will fall back to a deep copy on objects
            that are not estimators.

        Returns
        -------
        estimator : object
            The deep copy of the input, an estimator if input is an estimator.

        Notes
        -----
        If the estimator's `random_state` parameter is an integer (or if the
        estimator doesn't have a `random_state` parameter), an *exact clone* is
        returned: the clone and the original estimator will give the exact same
        results. Otherwise, *statistical clone* is returned: the clone might
        return different results from the original estimator. More details can be
        found in :ref:`randomness`.
        """
        estimator_type = type(estimator)
        # XXX: not handling dictionaries
        if estimator_type in (list, tuple, set, frozenset):
            return estimator_type([clone(e, safe=safe) for e in estimator])
        elif not hasattr(estimator, "get_params") or isinstance(estimator, type):
            if not safe:
                return copy.deepcopy(estimator)
            else:
                if isinstance(estimator, type):
                    raise TypeError(
                        "Cannot clone object. "
                        + "You should provide an instance of "
                        + "scikit-learn estimator instead of a class."
                    )
                else:
>                   raise TypeError(
                        "Cannot clone object '%s' (type %s): "
                        "it does not seem to be a scikit-learn "
                        "estimator as it does not implement a "
                        "'get_params' method." % (repr(estimator), type(estimator))
                    )
E                   TypeError: Cannot clone object '<imodels.tree.figs.FIGSRegressorCV object at 0x0000024E24343A00>' (type <class 'imodels.tree.figs.FIGSRegressorCV'>): it does not seem to be a scikit-learn estimator as it does not implement a 'get_params' method.

..\..\..\Anaconda3\envs\py310\lib\site-packages\sklearn\base.py:79: TypeError
______________________________________________ test_fit_regressor[regressor8] _______________________________________________

regressor = TaoTreeRegressor()

    @pytest.mark.parametrize("regressor", regressors)
    def test_fit_regressor(regressor) -> None:
        X, y = make_regression(n_samples=25, n_features=5)
        regressor_ = clone(regressor)
>       regressor_.fit(X, y)

tests\test_fit_print.py:60:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = TaoTreeRegressor()
X = array([[ 1.37335581e+00,  5.41363727e-01, -1.19638750e-01,
        -1.72968975e-01,  3.49329569e-01],
       [-1.55348...3.96825794e-01],
       [ 3.04033564e-01, -1.14044020e+00,  1.57034218e-01,
         1.33012401e+00,  1.06648724e-01]])
y = array([ 103.98582366,  -83.95949067,   23.68672904, -136.32004647,
        -86.99044435,    1.25016217,   65.46450731,... 188.81385195,   86.91833492,
        -77.45621107,   34.3125444 ,   92.98970577,  -43.27341573,
         17.77429275])
feature_names = ['X0', 'X1', 'X2', 'X3', 'X4'], sample_weight = None

    def fit(self, X, y=None, feature_names=None, sample_weight=None):
        """
        Params
        ------
        _sample_weight: array-like of shape (n_samples,), default=None
            Sample weights. If None, then samples are equally weighted.
            Splits that would create child nodes with net zero or negative weight
            are ignored while searching for a split in each node.
        """
        X, y, feature_names = check_fit_arguments(self, X, y, feature_names)
        if isinstance(self, RegressorMixin):
>           raise Warning('TAO Regression is not yet tested')
E           Warning: TAO Regression is not yet tested

imodels\tree\tao.py:115: Warning
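The TaoTreeRegressor failure is not really a fitting bug: fit() deliberately does raise Warning('TAO Regression is not yet tested'), which surfaces as an exception. If the intent is only to flag the untested code path, a non-fatal alternative would be the warnings module (a suggestion, not the current imodels behavior):

import warnings

# Emits a UserWarning but lets fit() continue, instead of aborting
# the way `raise Warning(...)` does.
warnings.warn('TAO Regression is not yet tested')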
_______________________________________ test_fit_before_print_classifier[classifier1] _______________________________________

classifier = CorelsClassifier ({'c': 0.01, 'n_iter': 10000, 'map_type': 'prefix', 'policy': 'lower_bound', 'verbosity': [], 'ablation': 0, 'max_card': 2, 'min_support': 0.01})

    @pytest.mark.parametrize("classifier", classifiers)
    def test_fit_before_print_classifier(classifier) -> None:
>       print(classifier)

tests\test_fit_print.py:65:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = CorelsClassifier ({'c': 0.01, 'n_iter': 10000, 'map_type': 'prefix', 'policy': 'lower_bound', 'verbosity': [], 'ablation': 0, 'max_card': 2, 'min_support': 0.01})

    def __str__(self):
        if corels_supported:
            if self.str_print is not None:
                return 'OptimalRuleList:\n\n' + self.str_print
            else:
>               return 'OptimalRuleList:\n\n' + self.rl_.__str__()
E               AttributeError: 'OptimalRuleListClassifier' object has no attribute 'rl_'

imodels\rule_list\corels_wrapper.py:240: AttributeError
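This last traceback is a printing bug rather than a fit bug: __str__ assumes self.rl_ exists, but that attribute is only set by fit(). A guarded version could fall back gracefully for an unfitted model; a self-contained sketch reusing the attribute names from the traceback above (the corels_supported check is omitted for brevity):

class OptimalRuleListStrSketch:
    """Hypothetical guard so printing an unfitted model does not raise."""
    str_print = None  # set by fit() in the real class

    def __str__(self):
        if getattr(self, 'str_print', None) is not None:
            return 'OptimalRuleList:\n\n' + self.str_print
        if hasattr(self, 'rl_'):  # rl_ is only assigned during fit()
            return 'OptimalRuleList:\n\n' + str(self.rl_)
        return 'OptimalRuleList (not fitted yet)'

print(OptimalRuleListStrSketch())  # -> 'OptimalRuleList (not fitted yet)'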