aimclub / FEDOT

Automated modeling and machine learning framework FEDOT
https://fedot.readthedocs.io
BSD 3-Clause "New" or "Revised" License
619 stars 84 forks source link

[Bug]: Stratify ValueError: The least populated class in y has only 1 member, which is too few. [...] #1307

Closed DRMPN closed 20 hours ago

DRMPN commented 5 days ago

Stratification fails when fit() is called:

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

Expected Behavior

Either the train_test_data_setup() function performs stratification correctly, or the fit() method accepts a stratify parameter to enable or disable it.

Current Behavior

Fedot cannot stratify the data and therefore fails to fit on it.

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
File /opt/conda/lib/python3.10/site-packages/fedot/api/api_utils/assumptions/assumptions_handler.py:67, in AssumptionsHandler.fit_assumption_and_check_correctness(self, pipeline, pipelines_cache, preprocessing_cache, eval_n_jobs)
     66 try:
---> 67     data_train, data_test = train_test_data_setup(self.data)
     68     self.log.info('Initial pipeline fitting started')

File /opt/conda/lib/python3.10/site-packages/fedot/core/data/data_split.py:199, in train_test_data_setup(data, split_ratio, shuffle, shuffle_flag, stratify, random_seed, validation_blocks)
    198     split_func = split_func_dict[data.data_type]
--> 199     train_data, test_data = split_func(data, **input_arguments)
    200 elif isinstance(data, MultiModalData):

File /opt/conda/lib/python3.10/site-packages/fedot/core/data/data_split.py:101, in _split_any(data, split_ratio, shuffle, stratify, random_seed, **kwargs)
     99 stratify_labels = data.target if stratify else None
--> 101 train_ids, test_ids = train_test_split(np.arange(0, len(data.target)),
    102                                        test_size=1. - split_ratio,
    103                                        shuffle=shuffle,
    104                                        random_state=random_seed,
    105                                        stratify=stratify_labels)
    107 train_data = _split_input_data_by_indexes(data, index=train_ids)

File /opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_split.py:2583, in train_test_split(test_size, train_size, random_state, shuffle, stratify, *arrays)
   2581     cv = CVClass(test_size=n_test, train_size=n_train, random_state=random_state)
-> 2583     train, test = next(cv.split(X=arrays[0], y=stratify))
   2585 return list(
   2586     chain.from_iterable(
   2587         (_safe_indexing(a, train), _safe_indexing(a, test)) for a in arrays
   2588     )
   2589 )

File /opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_split.py:1689, in BaseShuffleSplit.split(self, X, y, groups)
   1688 X, y, groups = indexable(X, y, groups)
-> 1689 for train, test in self._iter_indices(X, y, groups):
   1690     yield train, test

File /opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_split.py:2078, in StratifiedShuffleSplit._iter_indices(self, X, y, groups)
   2077 if np.min(class_counts) < 2:
-> 2078     raise ValueError(
   2079         "The least populated class in y has only 1"
   2080         " member, which is too few. The minimum"
   2081         " number of groups for any class cannot"
   2082         " be less than 2."
   2083     )
   2085 if n_train < n_classes:

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
Cell In[61], line 1
----> 1 auto_model.fit(features=train)

File /opt/conda/lib/python3.10/site-packages/fedot/api/main.py:176, in Fedot.fit(self, features, target, predefined_model)
    172     self.current_pipeline = PredefinedModel(predefined_model, self.train_data, self.log,
    173                                             use_input_preprocessing=self.params.get(
    174                                                 'use_input_preprocessing')).fit()
    175 else:
--> 176     self.current_pipeline, self.best_models, self.history = self.api_composer.obtain_model(self.train_data)
    178     if self.current_pipeline is None:
    179         raise ValueError('No models were found')

File /opt/conda/lib/python3.10/site-packages/fedot/api/api_utils/api_composer.py:63, in ApiComposer.obtain_model(self, train_data)
     59 with_tuning = self.params.get('with_tuning')
     61 self.timer = ApiTime(time_for_automl=timeout, with_tuning=with_tuning)
---> 63 initial_assumption, fitted_assumption = self.propose_and_fit_initial_assumption(train_data)
     65 multi_objective = len(self.metrics) > 1
     66 self.params.init_params_for_composing(self.timer.timedelta_composing, multi_objective)

File /opt/conda/lib/python3.10/site-packages/fedot/api/api_utils/api_composer.py:107, in ApiComposer.propose_and_fit_initial_assumption(self, train_data)
    100 initial_assumption = assumption_handler.propose_assumptions(self.params.get('initial_assumption'),
    101                                                             available_operations,
    102                                                             use_input_preprocessing=self.params.get(
    103                                                                 'use_input_preprocessing'))
    105 with self.timer.launch_assumption_fit():
    106     fitted_assumption = \
--> 107         assumption_handler.fit_assumption_and_check_correctness(deepcopy(initial_assumption[0]),
    108                                                                 pipelines_cache=self.pipelines_cache,
    109                                                                 preprocessing_cache=self.preprocessing_cache,
    110                                                                 eval_n_jobs=self.params.n_jobs)
    112 self.log.message(
    113     f'Initial pipeline was fitted in {round(self.timer.assumption_fit_spend_time.total_seconds(), 1)} sec.')
    115 self.params.update(preset=assumption_handler.propose_preset(preset, self.timer, n_jobs=self.params.n_jobs))

File /opt/conda/lib/python3.10/site-packages/fedot/api/api_utils/assumptions/assumptions_handler.py:86, in AssumptionsHandler.fit_assumption_and_check_correctness(self, pipeline, pipelines_cache, preprocessing_cache, eval_n_jobs)
     81     MemoryAnalytics.log(self.log,
     82                         additional_info='fitting of the initial pipeline',
     83                         logging_level=45)  # message logging level
     85 except Exception as ex:
---> 86     self._raise_evaluating_exception(ex)
     87 return pipeline

File /opt/conda/lib/python3.10/site-packages/fedot/api/api_utils/assumptions/assumptions_handler.py:94, in AssumptionsHandler._raise_evaluating_exception(self, ex)
     92 self.log.info(fit_failed_info)
     93 print(traceback.format_exc())
---> 94 raise ValueError(advice_info)

ValueError: Initial pipeline fit was failed due to: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.. Check pipeline structure and the correctness of the data

Possible Solution

A possible solution can be found in Fedot.Industrial.

Steps to Reproduce

Live example - https://www.kaggle.com/code/eliyahusanti/steel-plates-faults

Context [OPTIONAL]

Participating in the Kaggle Playground Series S4E3 competition.