
MultiOutputRegressor with Grouped Kfold #935

yungchidanielcho commented 1 year ago

I have a multi-output regression problem that I want to cross-validate with grouped k-fold.

import numpy as np
from flaml import AutoML
from sklearn.multioutput import MultiOutputRegressor

rng = np.random.RandomState(1338)
# Generate the feature data
n_points = 100
X = rng.randn(n_points, 10)

np.random.seed(2023)
# two outputs
y = np.random.randn(n_points, 2)
# Generate uneven groups
group_prior = rng.dirichlet([2] * 10)
groups = np.repeat(np.arange(10), rng.multinomial(n_points, group_prior))

The following settings, without groups, run fine:

settings = {
    "time_budget": 3,  # total running time in seconds
    "metric": 'rmse', 
    "task": 'regression',  # task type    
    "log_file_name": 'undestanding_cross_validation_default.log',
    "log_training_metric": True,  # whether to log training metric
    "keep_search_state": True, # needed if you want to keep the cross validation information
    "eval_method": "cv",
    #"split_type": "group",
    #"groups": groups,
    "n_splits": 3
}
automl = MultiOutputRegressor(AutoML(**settings))
automl.fit(X, y)
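
For reference, each output gets its own fitted AutoML instance under the wrapper's estimators_ attribute, so the per-output searches can be inspected afterwards. A minimal sketch, assuming the documented FLAML attributes best_estimator, best_config and best_loss are available on each fitted instance:

# Inspect the per-output AutoML searches after the wrapped fit (sketch only).
for i, est in enumerate(automl.estimators_):
    print(f"output {i}: learner={est.best_estimator}, "
          f"loss={est.best_loss:.4f}, config={est.best_config}")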

The following settings, with groups, do not run:

settings = {
    "time_budget": 3,  # total running time in seconds
    "metric": 'rmse', 
    "task": 'regression',  # task type    
    "log_file_name": 'undestanding_cross_validation_groupkfold.log',
    "log_training_metric": True,  # whether to log training metric
    "keep_search_state": True, # needed if you want to keep the cross validation information
    "eval_method": "cv",
    "split_type": "group",
    "groups": groups,
    "n_splits": 3
}
automl = MultiOutputRegressor(AutoML(**settings))
automl.fit(X, y)
[flaml.automl.automl: 02-27 13:27:34] {2716} INFO - task = regression
[flaml.automl.automl: 02-27 13:27:34] {2718} INFO - Data split method: group
[flaml.automl.automl: 02-27 13:27:34] {2721} INFO - Evaluation method: cv
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In [37], line 6
      3 settings["log_file_name"] = 'undestanding_cross_validation_groupkfold.log'
      5 automl = MultiOutputRegressor(AutoML(**settings))
----> 6 automl.fit(X, y)

File C:\ProgramData\mambaforge\envs\mamba_flaml\lib\site-packages\sklearn\multioutput.py:216, in _MultiOutputEstimator.fit(self, X, y, sample_weight, **fit_params)
    212     raise ValueError("Underlying estimator does not support sample weights.")
    214 fit_params_validated = _check_fit_params(X, fit_params)
--> 216 self.estimators_ = Parallel(n_jobs=self.n_jobs)(
    217     delayed(_fit_estimator)(
    218         self.estimator, X, y[:, i], sample_weight, **fit_params_validated
    219     )
    220     for i in range(y.shape[1])
    221 )
    223 if hasattr(self.estimators_[0], "n_features_in_"):
    224     self.n_features_in_ = self.estimators_[0].n_features_in_

File C:\ProgramData\mambaforge\envs\mamba_flaml\lib\site-packages\sklearn\utils\parallel.py:63, in Parallel.__call__(self, iterable)
     58 config = get_config()
     59 iterable_with_config = (
     60     (_with_config(delayed_func, config), args, kwargs)
     61     for delayed_func, args, kwargs in iterable
     62 )
---> 63 return super().__call__(iterable_with_config)

File ~\AppData\Roaming\Python\Python39\site-packages\joblib\parallel.py:1085, in Parallel.__call__(self, iterable)
   1076 try:
   1077     # Only set self._iterating to True if at least a batch
   1078     # was dispatched. In particular this covers the edge
   (...)
   1082     # was very quick and its callback already dispatched all the
   1083     # remaining jobs.
   1084     self._iterating = False
-> 1085     if self.dispatch_one_batch(iterator):
   1086         self._iterating = self._original_iterator is not None
   1088     while self.dispatch_one_batch(iterator):

File ~\AppData\Roaming\Python\Python39\site-packages\joblib\parallel.py:901, in Parallel.dispatch_one_batch(self, iterator)
    899     return False
    900 else:
--> 901     self._dispatch(tasks)
    902     return True

File ~\AppData\Roaming\Python\Python39\site-packages\joblib\parallel.py:819, in Parallel._dispatch(self, batch)
    817 with self._lock:
    818     job_idx = len(self._jobs)
--> 819     job = self._backend.apply_async(batch, callback=cb)
    820     # A job can complete so quickly than its callback is
    821     # called before we get here, causing self._jobs to
    822     # grow. To ensure correct results ordering, .insert is
    823     # used (rather than .append) in the following line
    824     self._jobs.insert(job_idx, job)

File ~\AppData\Roaming\Python\Python39\site-packages\joblib\_parallel_backends.py:208, in SequentialBackend.apply_async(self, func, callback)
    206 def apply_async(self, func, callback=None):
    207     """Schedule a func to be run"""
--> 208     result = ImmediateResult(func)
    209     if callback:
    210         callback(result)

File ~\AppData\Roaming\Python\Python39\site-packages\joblib\_parallel_backends.py:597, in ImmediateResult.__init__(self, batch)
    594 def __init__(self, batch):
    595     # Don't delay the application, to avoid keeping the input
    596     # arguments in memory
--> 597     self.results = batch()

File ~\AppData\Roaming\Python\Python39\site-packages\joblib\parallel.py:288, in BatchedCalls.__call__(self)
    284 def __call__(self):
    285     # Set the default nested backend to self._backend but do not set the
    286     # change the default number of processes to -1
    287     with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 288         return [func(*args, **kwargs)
    289                 for func, args, kwargs in self.items]

File ~\AppData\Roaming\Python\Python39\site-packages\joblib\parallel.py:288, in <listcomp>(.0)
    284 def __call__(self):
    285     # Set the default nested backend to self._backend but do not set the
    286     # change the default number of processes to -1
    287     with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 288         return [func(*args, **kwargs)
    289                 for func, args, kwargs in self.items]

File C:\ProgramData\mambaforge\envs\mamba_flaml\lib\site-packages\sklearn\utils\parallel.py:123, in _FuncWrapper.__call__(self, *args, **kwargs)
    121     config = {}
    122 with config_context(**config):
--> 123     return self.function(*args, **kwargs)

File C:\ProgramData\mambaforge\envs\mamba_flaml\lib\site-packages\sklearn\multioutput.py:49, in _fit_estimator(estimator, X, y, sample_weight, **fit_params)
     47     estimator.fit(X, y, sample_weight=sample_weight, **fit_params)
     48 else:
---> 49     estimator.fit(X, y, **fit_params)
     50 return estimator

File ~\AppData\Roaming\Python\Python39\site-packages\flaml\automl\automl.py:2766, in AutoML.fit(self, X_train, y_train, dataframe, label, metric, task, n_jobs, log_file_name, estimator_list, time_budget, max_iter, sample, ensemble, eval_method, log_type, model_history, split_ratio, n_splits, log_training_metric, mem_thres, pred_time_limit, train_time_limit, X_val, y_val, sample_weight_val, groups_val, groups, verbose, retrain_full, split_type, learner_selector, hpo_method, starting_points, seed, n_concurrent_trials, keep_search_state, preserve_checkpoint, early_stop, append_log, auto_augment, min_sample_size, use_ray, use_spark, free_mem_ratio, metric_constraints, custom_hp, cv_score_agg_func, skip_transform, fit_kwargs_by_estimator, **fit_kwargs)
   2764 self._min_sample_size = _sample_size_from_starting_points or min_sample_size
   2765 self._min_sample_size_input = min_sample_size
-> 2766 self._prepare_data(eval_method, split_ratio, n_splits)
   2768 if isinstance(self._min_sample_size, dict):
   2769     self._sample = {
   2770         (
   2771             k,
   (...)
   2780         for k in self._min_sample_size.keys()
   2781     }

File ~\AppData\Roaming\Python\Python39\site-packages\flaml\automl\automl.py:1525, in AutoML._prepare_data(self, eval_method, split_ratio, n_splits)
   1521     return
   1522 if self._split_type == "group":
   1523     # logger.info("Using GroupKFold")
   1524     assert (
-> 1525         len(self._state.groups_all) == y_train_all.size
   1526     ), "the length of groups must match the number of examples"
   1527     assert (
   1528         len(np.unique(self._state.groups_all)) >= n_splits
   1529     ), "the number of groups must be equal or larger than n_splits"
   1530     self._state.kf = GroupKFold(n_splits)

TypeError: object of type 'NoneType' has no len()

Is group k-fold supported for MultiOutputRegressor? How do I make self._state.groups_all not None?
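
For what it's worth, here is a minimal sketch of one untested idea, based only on the traceback above: groups appears in the AutoML.fit signature, and MultiOutputRegressor forwards extra fit keyword arguments to each per-output estimator's fit call, so the groups could perhaps be supplied at fit time instead of in the constructor settings (an assumption, not a confirmed fix):

# Untested sketch: drop "groups" from the constructor settings and pass it to
# fit instead, so MultiOutputRegressor forwards it to each AutoML.fit call.
settings_no_groups = {k: v for k, v in settings.items() if k != "groups"}
automl = MultiOutputRegressor(AutoML(**settings_no_groups))
automl.fit(X, y, groups=groups)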

sonichi commented 1 year ago

Could you try adding "keep_search_state": True in settings?