microsoft / FLAML

A fast library for AutoML and tuning. Join our Discord: https://discord.gg/Cppx2vSPVP.
https://microsoft.github.io/FLAML/
MIT License
3.75k stars 495 forks source link

Preprocessing Data is missng ,raise key error. automl self was created data that is not repreat behaviour. #1278

Open 731315163 opened 4 months ago

731315163 commented 4 months ago

= X[self.regressors] File "/usr/app/regression/venv/lib/python3.10/site-packages/pandas/core/frame.py", line 3899, in getitem indexer = self.columns._get_indexer_strict(key, "columns")[1] File "/usr/app/regression/venv/lib/python3.10/site-packages/pandas/core/indexes/base.py", line 6115, in _get_indexer_strict self._raise_if_missing(keyarr, indexer, axis_name) File "/usr/app/regression/venv/lib/python3.10/site-packages/pandas/core/indexes/base.py", line 6179, in _raise_if_missing raise KeyError(f"{not_found} not in index") KeyError: "['index_sin1', 'index_sin6', 'index_cos5', 'index_second_cos', 'index_sin2', 'index_second_sin', 'index_cos4', 'index_minute_cos', 'index_dayofweek_sin', 'index_cos1', 'index_month_cos', 'index_dayofyear_sin', 'index_sin3', 'index_sin5', 'index_cos3', 'index_hour_sin', 'index_cos6', 'index_hour_cos', 'index_month_sin', 'index_minute_sin', 'index_sin4', 'index_cos2', 'index_quarter_sin', 'index_quarter_cos', 'index_dayofyear_cos', 'index_dayofweek_cos'] not in index"

import pandas as pd
from flaml.automl import AutoML, logger_formatter
from flaml.tune.searcher import CFO, BlendSearch, FLOW2, BlendSearchTuner
import numpy as np
from libX import PreprocessingData as data
import pickle
import os.path as path

savepath = data.JoinCurDir("automl.pkl")
datename = "DATE"
openen = "Open"

def train():

    traindata = data.get_rawdata_df(["DATE", "WM2NS", "UNRATE", "Open"])
    traindata.reset_index(inplace=True)
    traindata[datename] = pd.to_datetime(traindata[datename], format="%Y-%m-%d")
    traindata[openen] = pd.to_numeric(traindata[openen])
    traindata[datename] = traindata[datename].asfreq("D")
    traindata.set_index(keys=datename, inplace=True)

    print(traindata.head(3))
    # print(trainx[0:2])
    # print(trainy[0:2])
    automl = AutoML()
    automl_settings = {
        "task": "ts_forecast",
        "time_budget": 60 * 10,
        # "estimator_list": ["prophet", "arima", "sarimax"],
        "log_file_name": "ts_forecast.log",
        "period": 14,
    }

    automl.fit(
        dataframe=traindata,  # a single column of timestamp
        label=openen,  # value for each timestamp
        # time horizon to forecast, e.g., 12 months

        # split_type="time",

        ensemble=True,

        early_stop=True,

        # skip_transform=True,
        **automl_settings
    )
    automl.save_best_config(savepath)
    with open(savepath, "wb") as f:
        pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)
    return automl

def findmodel():
    if path.exists(savepath):
        # At prediction time
        with open(savepath, "rb") as f:
            return pickle.load(f)
    else:
        return train()

datetimetest = pd.DataFrame(
    {
        datename: [1, 2, 3],
        "WM2NS": [20966.057142857142, 20963.77142857143, 20961.485714285714],
        "UNRATE": [3.7, 3.7, 3.7],
    }
)
automl = findmodel()
datetimetest.set_index(datename)
pred = automl.predict(datetimetest)
print(pred)
thinkall commented 4 months ago

Hi @731315163 , the error message is related to pandas and the data itself. Can you check the index stuff works w/o involving flaml? Besides, you're using the same savepath for both best config and the automl instance.