sapientml / core

A SapientML plugin of SapientMLGenerator
Apache License 2.0

A ValueError occurs during hyperparameter tuning in the candidate script using GradientBoosting #64

Open mariko-sugawara opened 8 months ago

mariko-sugawara commented 8 months ago

Describe the bug If the hyperparameter 'loss' is 'exponential' in GradientBoostingClassifier, the AdaBoost algorithm is applied. AdaBoost builds weak learners that separate only two classes, so a ValueError occurs when the target is multiclass.
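
The failure is reproducible without SapientML; a minimal sketch against scikit-learn directly (assuming a scikit-learn version where 'log_loss' and 'exponential' are the valid choices, i.e. >= 1.1):

```python
# Minimal reproduction of the underlying scikit-learn error (no SapientML).
from sklearn.datasets import load_iris
from sklearn.ensemble import GradientBoostingClassifier

X, y = load_iris(return_X_y=True)  # Iris: 3 classes

# 'exponential' selects the AdaBoost-style loss, which supports only 2 classes.
model = GradientBoostingClassifier(loss="exponential")
model.fit(X, y)  # ValueError: ExponentialLoss requires 2 classes; got 3 class(es)
```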

To Reproduce Steps to reproduce the behavior:

1. Show your code calling generate_code().

script

```python
from sapientml import SapientML  # import added for completeness

cls = SapientML(
    target_columns=["species"],
    add_explanation=True,
    split_train_size=0.75,
    hyperparameter_tuning=True,
    hyperparameter_tuning_n_trials=10,
    hyperparameter_tuning_timeout=600,
)
model = cls.fit(train_data_all).model
```

2. Attach the datasets or dataframes input to generate_code() if possible.

https://scikit-learn.org/stable/auto_examples/datasets/plot_iris_dataset.html (the Iris dataset; a plausible construction of the dataframe is sketched after this list)

3. Show the generated code such as 1_default.py when it was generated.

generated code

```python
# *** GENERATED PIPELINE ***

# LOAD DATA
import pandas as pd

train_dataset = pd.read_pickle(r"/home/sugawara/PoC/mobilePF/outputs/training.pkl")

# TRAIN-TEST SPLIT
from sklearn.model_selection import train_test_split


def split_dataset(dataset, train_size=0.75, random_state=17):
    train_dataset, test_dataset = train_test_split(dataset, train_size=train_size, random_state=random_state)
    return train_dataset, test_dataset


train_dataset, test_dataset = split_dataset(train_dataset)
train_dataset, validation_dataset = split_dataset(train_dataset)

# SUBSAMPLE
# If the number of rows of train_dataset is larger than sample_size, sample rows to sample_size for speedup.
from lib.sample_dataset import sample_dataset

train_dataset = sample_dataset(
    dataframe=train_dataset,
    sample_size=100000,
    target_columns=['species'],
    task_type='classification',
)

test_dataset = validation_dataset

# DETACH TARGET
TARGET_COLUMNS = ['species']
feature_train = train_dataset.drop(TARGET_COLUMNS, axis=1)
target_train = train_dataset[TARGET_COLUMNS].copy()
feature_test = test_dataset.drop(TARGET_COLUMNS, axis=1)
target_test = test_dataset[TARGET_COLUMNS].copy()

# HYPERPARAMETER OPTIMIZATION
import optuna
from sklearn.ensemble import GradientBoostingClassifier


# NEED CV: ex.) optuna.integration.OptunaSearchCV()
class Objective(object):
    def __init__(self, feature_train, target_train, feature_test, target_test, __random_state):
        self.feature_train = feature_train
        self.target_train = target_train
        self.feature_test = feature_test
        self.target_test = target_test
        self.__random_state = __random_state

    def __call__(self, trial):
        def set_hyperparameters(trial):
            params = {}
            params['loss'] = trial.suggest_categorical('loss', ['log_loss', 'deviance', 'exponential'])  # log_loss
            params['n_estimators'] = trial.suggest_int('n_estimators', 10, 1000, log=True)  # 100
            params['subsample'] = trial.suggest_float('subsample', 0.2, 1)  # 1
            params['criterion'] = trial.suggest_categorical('criterion', ['friedman_mse', 'squared_error'])  # 'friedman_mse'
            params['min_samples_leaf'] = trial.suggest_int('min_samples_leaf', 1, 32, log=True)  # 1
            params['max_features'] = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])  # None
            return params

        # SET DATA
        import numpy as np
        if isinstance(self.feature_train, pd.DataFrame):
            feature_train = self.feature_train
        elif isinstance(self.feature_train, np.ndarray):
            feature_train = pd.DataFrame(self.feature_train)
        else:
            feature_train = pd.DataFrame(self.feature_train.toarray())
        if isinstance(self.target_train, pd.DataFrame):
            target_train = self.target_train
        elif isinstance(self.target_train, np.ndarray):
            target_train = pd.DataFrame(self.target_train)
        else:
            target_train = pd.DataFrame(self.target_train.toarray())
        if isinstance(self.feature_test, pd.DataFrame):
            feature_test = self.feature_test
        elif isinstance(self.feature_test, np.ndarray):
            feature_test = pd.DataFrame(self.feature_test)
        else:
            feature_test = pd.DataFrame(self.feature_test.toarray())
        if isinstance(self.target_test, pd.DataFrame):
            target_test = self.target_test
        elif isinstance(self.target_test, np.ndarray):
            target_test = pd.DataFrame(self.target_test)
        else:
            target_test = pd.DataFrame(self.target_test.toarray())

        # MODEL
        params = set_hyperparameters(trial)
        model = GradientBoostingClassifier(random_state=self.__random_state, **params)
        model.fit(feature_train, target_train.values.ravel())
        y_pred = model.predict(feature_test)
        from sklearn import metrics
        score = metrics.f1_score(target_test, y_pred, average='macro')
        return score


n_trials = 10
timeout = 600
random_state = 1023
random_state_model = 42
direction = 'maximize'
study = optuna.create_study(direction=direction, sampler=optuna.samplers.TPESampler(seed=random_state))
default_hyperparameters = {'criterion': 'friedman_mse', 'loss': 'log_loss', 'max_features': None, 'min_samples_leaf': 1, 'n_estimators': 100, 'subsample': 1.0}
study.enqueue_trial(default_hyperparameters)
study.optimize(Objective(feature_train, target_train, feature_test, target_test, random_state_model), n_trials=n_trials, timeout=timeout)
best_params = study.best_params
print("best params:", best_params)
print("RESULT: f1: " + str(study.best_value))
```
4. Show the messages of SapientML and/or generated code (a tuning-level workaround is sketched after this list).

```
ValueError: ExponentialLoss requires 2 classes; got 3 class(es)
```
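
For step 2, the linked page is the scikit-learn Iris example, so the training dataframe was presumably built along these lines (a sketch; the actual pickle contents and the species encoding are assumptions):

```python
# Hypothetical reconstruction of train_data_all from the linked Iris example;
# the real data came from a pickle, so the column layout is an assumption.
import pandas as pd
from sklearn.datasets import load_iris

iris = load_iris(as_frame=True)
train_data_all = iris.frame.rename(columns={"target": "species"})
# Map the integer labels 0/1/2 to the three species names.
train_data_all["species"] = train_data_all["species"].map(dict(enumerate(iris.target_names)))
```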

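A tuning-level workaround (a sketch, not the project's actual fix) is to prune any trial whose sampled hyperparameters are invalid for the data, so one bad combination does not abort the whole study:

```python
# Sketch: catch the fit-time ValueError inside the Optuna objective and prune
# the trial instead of letting the study crash. Names below are illustrative.
import optuna
from sklearn.datasets import load_iris
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=17)

def objective(trial):
    loss = trial.suggest_categorical('loss', ['log_loss', 'exponential'])
    model = GradientBoostingClassifier(loss=loss, n_estimators=50, random_state=42)
    try:
        model.fit(X_train, y_train)
    except ValueError as e:  # e.g. "ExponentialLoss requires 2 classes; got 3 class(es)"
        raise optuna.TrialPruned() from e
    return f1_score(y_test, model.predict(X_test), average='macro')

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=5)
```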
Expected behavior

Environment (please complete the following information):

Additional context If the target is multiclass, the hyperparameter "loss" must be restricted to "log_loss" only; "exponential" supports binary classification only.
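
A possible direction for the fix (a sketch against the generated search-space code above, not an actual SapientML patch) is to make the 'loss' candidates conditional on the number of classes in the target:

```python
# Hypothetical fix sketch: restrict the search space before the study runs.
# n_classes is computed from the target column used in the generated script.
n_classes = target_train[TARGET_COLUMNS[0]].nunique()
loss_choices = ['log_loss'] if n_classes > 2 else ['log_loss', 'exponential']

def set_hyperparameters(trial):
    params = {}
    # 'exponential' (the AdaBoost loss) is only offered for binary targets.
    params['loss'] = trial.suggest_categorical('loss', loss_choices)
    params['n_estimators'] = trial.suggest_int('n_estimators', 10, 1000, log=True)
    # ... remaining hyperparameters unchanged ...
    return params
```

Because n_classes is fixed for a given dataset, the choice list is the same for every trial in a study, which keeps Optuna's requirement that a categorical parameter's value space stay constant.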