I have a multi-output regression problem that I want to cross-validate with grouped K-fold (GroupKFold).
import numpy as np
from flaml import AutoML
from sklearn.multioutput import MultiOutputRegressor
# Reproducible toy data for a two-output regression problem with uneven groups.
# Every array length is derived from n_points, so X, y and groups stay aligned
# if the sample count is changed.
rng = np.random.RandomState(1338)
# Generate the class/group data
n_points = 100
X = rng.randn(n_points, 10)  # feature matrix: n_points rows, 10 columns
# The targets come from NumPy's global generator (seeded separately), not from
# rng, so they do not consume any of rng's random stream.
np.random.seed(2023)
# two outputs
y = np.random.randn(n_points, 2)
# Generate uneven groups: draw 10 group proportions, turn them into per-group
# counts that sum to n_points, then repeat each group label by its count.
group_prior = rng.dirichlet([2] * 10)
groups = np.repeat(np.arange(10), rng.multinomial(n_points, group_prior))
This setting, without groups, runs:
# Baseline run: plain 3-fold cross-validation.
# "split_type" and "groups" are deliberately left out of this configuration.
settings = {
    # total running time in seconds
    "time_budget": 3,
    "metric": 'rmse',
    # task type
    "task": 'regression',
    "log_file_name": 'undestanding_cross_validation_default.log',
    # whether to log training metric
    "log_training_metric": True,
    # needed if you want to keep the cross validation information
    "keep_search_state": True,
    "eval_method": "cv",
    "n_splits": 3,
}
# Wrap AutoML in scikit-learn's multi-output wrapper and fit on both targets.
automl = MultiOutputRegressor(estimator=AutoML(**settings))
automl.fit(X, y)
This setting, with groups, doesn't run:
# Failing reproduction: same configuration as the baseline, plus group-based
# cross-validation ("split_type": "group") with the group labels supplied
# through the AutoML constructor settings.
settings = {
"time_budget": 3, # total running time in seconds
"metric": 'rmse',
"task": 'regression', # task type
"log_file_name": 'undestanding_cross_validation_groupkfold.log',
"log_training_metric": True, # whether to log training metric
"keep_search_state": True, # needed if you want to keep the cross validation information
"eval_method": "cv",
"split_type": "group",
# NOTE(review): the traceback shows AutoML.fit() has its own `groups`
# parameter and that self._state.groups_all is None at the GroupKFold check,
# so a value passed here apparently never reaches fit(). Presumably the
# labels must be passed at fit time instead, e.g.
# automl.fit(X, y, groups=groups) -- TODO confirm against the FLAML docs.
"groups": groups,
"n_splits": 3
}
automl = MultiOutputRegressor(AutoML(**settings))
# Raises TypeError: object of type 'NoneType' has no len() (see traceback).
automl.fit(X, y)
[flaml.automl.automl: 02-27 13:27:34] {2716} INFO - task = regression
[flaml.automl.automl: 02-27 13:27:34] {2718} INFO - Data split method: group
[flaml.automl.automl: 02-27 13:27:34] {2721} INFO - Evaluation method: cv
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Cell In [37], line 6
3 settings["log_file_name"] = 'undestanding_cross_validation_groupkfold.log'
5 automl = MultiOutputRegressor(AutoML(**settings))
----> 6 automl.fit(X, y)
File C:\ProgramData\mambaforge\envs\mamba_flaml\lib\site-packages\sklearn\multioutput.py:216, in _MultiOutputEstimator.fit(self, X, y, sample_weight, **fit_params)
212 raise ValueError("Underlying estimator does not support sample weights.")
214 fit_params_validated = _check_fit_params(X, fit_params)
--> 216 self.estimators_ = Parallel(n_jobs=self.n_jobs)(
217 delayed(_fit_estimator)(
218 self.estimator, X, y[:, i], sample_weight, **fit_params_validated
219 )
220 for i in range(y.shape[1])
221 )
223 if hasattr(self.estimators_[0], "n_features_in_"):
224 self.n_features_in_ = self.estimators_[0].n_features_in_
File C:\ProgramData\mambaforge\envs\mamba_flaml\lib\site-packages\sklearn\utils\parallel.py:63, in Parallel.__call__(self, iterable)
58 config = get_config()
59 iterable_with_config = (
60 (_with_config(delayed_func, config), args, kwargs)
61 for delayed_func, args, kwargs in iterable
62 )
---> 63 return super().__call__(iterable_with_config)
File ~\AppData\Roaming\Python\Python39\site-packages\joblib\parallel.py:1085, in Parallel.__call__(self, iterable)
1076 try:
1077 # Only set self._iterating to True if at least a batch
1078 # was dispatched. In particular this covers the edge
(...)
1082 # was very quick and its callback already dispatched all the
1083 # remaining jobs.
1084 self._iterating = False
-> 1085 if self.dispatch_one_batch(iterator):
1086 self._iterating = self._original_iterator is not None
1088 while self.dispatch_one_batch(iterator):
File ~\AppData\Roaming\Python\Python39\site-packages\joblib\parallel.py:901, in Parallel.dispatch_one_batch(self, iterator)
899 return False
900 else:
--> 901 self._dispatch(tasks)
902 return True
File ~\AppData\Roaming\Python\Python39\site-packages\joblib\parallel.py:819, in Parallel._dispatch(self, batch)
817 with self._lock:
818 job_idx = len(self._jobs)
--> 819 job = self._backend.apply_async(batch, callback=cb)
820 # A job can complete so quickly than its callback is
821 # called before we get here, causing self._jobs to
822 # grow. To ensure correct results ordering, .insert is
823 # used (rather than .append) in the following line
824 self._jobs.insert(job_idx, job)
File ~\AppData\Roaming\Python\Python39\site-packages\joblib\_parallel_backends.py:208, in SequentialBackend.apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
File ~\AppData\Roaming\Python\Python39\site-packages\joblib\_parallel_backends.py:597, in ImmediateResult.__init__(self, batch)
594 def __init__(self, batch):
595 # Don't delay the application, to avoid keeping the input
596 # arguments in memory
--> 597 self.results = batch()
File ~\AppData\Roaming\Python\Python39\site-packages\joblib\parallel.py:288, in BatchedCalls.__call__(self)
284 def __call__(self):
285 # Set the default nested backend to self._backend but do not set the
286 # change the default number of processes to -1
287 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 288 return [func(*args, **kwargs)
289 for func, args, kwargs in self.items]
File ~\AppData\Roaming\Python\Python39\site-packages\joblib\parallel.py:288, in <listcomp>(.0)
284 def __call__(self):
285 # Set the default nested backend to self._backend but do not set the
286 # change the default number of processes to -1
287 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 288 return [func(*args, **kwargs)
289 for func, args, kwargs in self.items]
File C:\ProgramData\mambaforge\envs\mamba_flaml\lib\site-packages\sklearn\utils\parallel.py:123, in _FuncWrapper.__call__(self, *args, **kwargs)
121 config = {}
122 with config_context(**config):
--> 123 return self.function(*args, **kwargs)
File C:\ProgramData\mambaforge\envs\mamba_flaml\lib\site-packages\sklearn\multioutput.py:49, in _fit_estimator(estimator, X, y, sample_weight, **fit_params)
47 estimator.fit(X, y, sample_weight=sample_weight, **fit_params)
48 else:
---> 49 estimator.fit(X, y, **fit_params)
50 return estimator
File ~\AppData\Roaming\Python\Python39\site-packages\flaml\automl\automl.py:2766, in AutoML.fit(self, X_train, y_train, dataframe, label, metric, task, n_jobs, log_file_name, estimator_list, time_budget, max_iter, sample, ensemble, eval_method, log_type, model_history, split_ratio, n_splits, log_training_metric, mem_thres, pred_time_limit, train_time_limit, X_val, y_val, sample_weight_val, groups_val, groups, verbose, retrain_full, split_type, learner_selector, hpo_method, starting_points, seed, n_concurrent_trials, keep_search_state, preserve_checkpoint, early_stop, append_log, auto_augment, min_sample_size, use_ray, use_spark, free_mem_ratio, metric_constraints, custom_hp, cv_score_agg_func, skip_transform, fit_kwargs_by_estimator, **fit_kwargs)
2764 self._min_sample_size = _sample_size_from_starting_points or min_sample_size
2765 self._min_sample_size_input = min_sample_size
-> 2766 self._prepare_data(eval_method, split_ratio, n_splits)
2768 if isinstance(self._min_sample_size, dict):
2769 self._sample = {
2770 (
2771 k,
(...)
2780 for k in self._min_sample_size.keys()
2781 }
File ~\AppData\Roaming\Python\Python39\site-packages\flaml\automl\automl.py:1525, in AutoML._prepare_data(self, eval_method, split_ratio, n_splits)
1521 return
1522 if self._split_type == "group":
1523 # logger.info("Using GroupKFold")
1524 assert (
-> 1525 len(self._state.groups_all) == y_train_all.size
1526 ), "the length of groups must match the number of examples"
1527 assert (
1528 len(np.unique(self._state.groups_all)) >= n_splits
1529 ), "the number of groups must be equal or larger than n_splits"
1530 self._state.kf = GroupKFold(n_splits)
TypeError: object of type 'NoneType' has no len()
Is group K-fold supported for MultiOutputRegressor? How do I make `self._state.groups_all` not None (the traceback shows that it is `groups_all`, not `self._state` itself, that is None)?
I have a multi-output regression problem that I want to cross-validate with grouped K-fold (GroupKFold).
This setting, without groups, runs.
This setting, with groups, doesn't run.
Is group K-fold supported for MultiOutputRegressor? How do I make `self._state.groups_all` not None?