abess-team / abess

Fast Best-Subset Selection Library
https://abess.readthedocs.io/

memory out when combining abess with auto-sklearn for classification. #452

Open belzheng opened 1 year ago

belzheng commented 1 year ago

Describe the bug I'm running some experiments that combine abess with auto-sklearn. When using MultinomialRegression for classification, memory usage grows very quickly, to the point that the notebook can no longer be displayed in a web page; with LinearRegression there is no similar out-of-memory problem.
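As a first check, here is a minimal sketch (my data are synthetic, and it assumes psutil is installed) that fits MultinomialRegression repeatedly and prints resident memory after each fit, to see whether the growth comes from the estimator itself rather than from auto-sklearn:

import os

import numpy as np
import psutil
from abess import MultinomialRegression

rng = np.random.default_rng(0)
X = rng.standard_normal((500, 50))
y = rng.integers(0, 3, size=500)  # three-class target

proc = psutil.Process(os.getpid())
for i in range(20):
    MultinomialRegression().fit(X, y)
    rss_mb = proc.memory_info().rss / 1024 ** 2
    print(f"fit {i:2d}: RSS = {rss_mb:.1f} MB")  # steady growth would suggest a leak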

Code for Reproduction

My code is given as follows:

from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import UniformIntegerHyperparameter

import sklearn.metrics
import autosklearn.classification
import autosklearn.pipeline.components.classification
from autosklearn.pipeline.components.base \
    import AutoSklearnClassificationAlgorithm
from autosklearn.pipeline.constants import DENSE, SIGNED_DATA, UNSIGNED_DATA, \
    PREDICTIONS

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

class AbessClassifier(AutoSklearnClassificationAlgorithm):

    def __init__(self, exchange_num, random_state=None):
        self.exchange_num = exchange_num
        self.random_state = random_state
        self.estimator = None

    def fit(self, X, y):
        from abess import MultinomialRegression
        # Forward the tuned hyperparameter to the estimator.
        self.estimator = MultinomialRegression(exchange_num=self.exchange_num)
        self.estimator.fit(X, y)
        return self

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict_proba(X)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {
            'shortname': 'abess Classifier',
            'name': 'abess logistic Classifier',
            'handles_regression': False,
            'handles_classification': True,
            'handles_multiclass': True,
            'handles_multilabel': False,
            'handles_multioutput': False,
            'is_deterministic': False,
            # Both input and output must be tuple(iterable)
            'input': [DENSE, SIGNED_DATA, UNSIGNED_DATA],
            'output': [PREDICTIONS]
        }

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        cs = ConfigurationSpace()
        exchange_num = UniformIntegerHyperparameter(
            name='exchange_num', lower=4, upper=6, default_value=5
        )
        cs.add_hyperparameters([exchange_num])
        return cs

# Add abess logistic classifier component to auto-sklearn.
autosklearn.pipeline.components.classification.add_classifier(AbessClassifier)
cs = AbessClassifier.get_hyperparameter_search_space()
print(cs)

dataset = fetch_openml(data_id=29, as_frame=True)  # alternative data ids: 507, 183, 44136
X = dataset.data
y = dataset.target
X.replace([np.inf, -np.inf], np.nan, inplace=True)
## Remove rows with NaN values (Inf was replaced by NaN above)
inx = X[X.isna().any(axis=1)].index
X.drop(inx, inplace=True)
y.drop(inx, inplace=True)
## Use dummy variables to replace categorical variables
X = pd.get_dummies(X)
## Keep only numeric columns
X = X.select_dtypes(np.number)
## Remove columns that still contain NaN or Inf values
nan_cols = X.columns[np.isnan(X).any()]
inf_cols = X.columns[np.isinf(X).any()]
X = X.drop(columns=nan_cols)
X = X.drop(columns=inf_cols)
## Encode target labels with values between 0 and n_classes - 1
le = preprocessing.LabelEncoder()
y = le.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y)
print(X_train.shape)  # (n_samples, n_features) of the training split
print(X_test.shape)   # (n_samples, n_features) of the test split

cls = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=60,
    per_run_time_limit=10,
    include={
            'classifier': ['AbessClassifier'],
            'feature_preprocessor': ['polynomial']
        },
    memory_limit=6144,
    ensemble_size=1,
)
cls.fit(X_train, y_train, X_test, y_test)
predictions = cls.predict(X_test)
print("Accuracy score", sklearn.metrics.accuracy_score(y_test, predictions))

After running this code, memory usage reaches about 159 MB, which makes the .ipynb hard for users to open. Again, regression does not encounter this memory problem.
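To put a number on the growth, one can sample the process's memory while the search runs; the sketch below assumes the memory_profiler package is installed and reuses cls, X_train, y_train, X_test, and y_test from above:

from memory_profiler import memory_usage  # assumed to be installed

# Sample resident memory every 0.5 s while cls.fit runs; auto-sklearn spawns
# worker subprocesses, so include them in the measurement.
mem_trace = memory_usage(
    (cls.fit, (X_train, y_train, X_test, y_test)),
    interval=0.5,
    include_children=True,
)
print(f"peak memory during fit: {max(mem_trace):.1f} MiB")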

Mamba413 commented 9 months ago

@belzheng, does this still appear with the latest abess Python package?
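For reference, a quick way to confirm which version is installed before re-running the reproduction (importlib.metadata is in the standard library since Python 3.8):

# Print the installed abess version; upgrade with `pip install --upgrade abess`.
from importlib.metadata import version

print(version("abess"))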