mljar / mljar-supervised

Python package for AutoML on Tabular Data with Feature Engineering, Hyper-Parameters Tuning, Explanations and Automatic Documentation
https://mljar.com
MIT License
3.02k stars 403 forks source link

Categorical preprocessing warning #777

Open maciekmalachowski opened 2 weeks ago

maciekmalachowski commented 2 weeks ago

miniconda3\Lib\site-packages\supervised\preprocessing\preprocessing_categorical.py:81:

FutureWarning: Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[0 0 0 ... 0 0 0]' has dtype incompatible with category, please explicitly cast to a compatible dtype first.

Dataset: https://www.openml.org/search?type=data&sort=runs&status=active&id=1114

pplonski commented 4 days ago

Please provide code to reproduce the issue.

maciekmalachowski commented 4 days ago
def generate_models():
    """Train one AutoML model set per (dataset, algorithm) pair.

    Each dataset entry is indexed as ``(X, y, name, ..., dataset_type)``:
    the first three items are the features, the target and the dataset
    name, and the last item is the task type. Results of every run are
    written to ``AutoML/<name>/<algorithm>``.
    """
    datasets = [
        kddcup09_upselling.get_data()
    ]

    algorithms = [
        'Baseline',
        'CatBoost',
        'Decision Tree',
        'Extra Trees',
        'LightGBM',
        'Neural Network',
        'Random Forest',
        'Xgboost',
    ]

    for data in datasets:
        X, y, name = data[:3]

        # various datasets need either rmse or accuracy as metric;
        # the task type is the last item and does not depend on the
        # algorithm, so it is resolved once per dataset
        eval_metric = "rmse" if data[-1] == "reg" else "accuracy"

        for alg in algorithms:
            results_path = f"AutoML/{name}/{alg}"

            # create directories for AutoML; exist_ok avoids the race
            # between checking for the directory and creating it
            os.makedirs(results_path, exist_ok=True)

            # create automl object
            automl = AutoML(
                mode="Compete",
                total_time_limit=600,
                results_path=results_path,
                algorithms=[alg],
                train_ensemble=False,
                golden_features=False,
                features_selection=False,
                stack_models=False,
                kmeans_features=False,
                explain_level=0,
                boost_on_errors=False,
                eval_metric=eval_metric,
                validation_strategy={
                    "validation_type": "kfold",
                    "k_folds": 5,
                    "shuffle": True,
                    "stratify": True,
                    "random_seed": 123,
                },
                start_random_models=10,
                hill_climbing_steps=3,
                top_models_to_improve=3,
                random_state=1234,
            )

            # train automl
            automl.fit(X, y)
  from sklearn.datasets import fetch_openml

  def get_data():
      """Fetch OpenML dataset 1114 together with its descriptive metadata.

      Returns:
          A ``(X, y, name, dataset_type)`` tuple: the ``data`` and
          ``target`` attributes of the fetched dataset, followed by the
          name ``"Kddcup09_upselling"`` and the task type ``"binary"``.
      """
      # read data from openml page
      bunch = fetch_openml(data_id=1114, as_frame=True)
      return bunch.data, bunch.target, "Kddcup09_upselling", "binary"