CV Score after 1 Generation is -inf, ValueError: Found input variables with inconsistent numbers of samples: [16, 16, 1]

Intro

I am attempting to use a custom scoring function that returns the average of several scikit-learn scoring metrics (as a Proof of Concept idea). The code I have now (which I will post below, along with a link to the dataset in parquet format that I'm using for reproducibility) returns the following traceback (sorry for the length of it, but figured it's best to be thorough):

Generation 1 - Current best internal CV score: -inf
Traceback (most recent call last):
  File "C:\Users\chalu\AppData\Roaming\Python\Python37\site-packages\tpot\base.py", line 828, in fit
    log_file=self.log_file_,
  File "C:\Users\chalu\AppData\Roaming\Python\Python37\site-packages\tpot\gp_deap.py", line 281, in eaMuPlusLambda
    per_generation_function(gen)
  File "C:\Users\chalu\AppData\Roaming\Python\Python37\site-packages\tpot\base.py", line 1176, in _check_periodic_pipeline
    self._update_top_pipeline()
  File "C:\Users\chalu\AppData\Roaming\Python\Python37\site-packages\tpot\base.py", line 931, in _update_top_pipeline
    error_score="raise",
  File "C:\Users\chalu\AppData\Roaming\Python\Python37\lib\site-packages\sklearn\model_selection\_validation.py", line 520, in cross_val_score
    error_score=error_score,
  File "C:\Users\chalu\AppData\Roaming\Python\Python37\lib\site-packages\sklearn\model_selection\_validation.py", line 283, in cross_validate
    for train, test in cv.split(X, y, groups)
  File "C:\Users\chalu\AppData\Roaming\Python\Python37\site-packages\joblib\parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "C:\Users\chalu\AppData\Roaming\Python\Python37\site-packages\joblib\parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File "C:\Users\chalu\AppData\Roaming\Python\Python37\site-packages\joblib\parallel.py", line 777, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "C:\Users\chalu\AppData\Roaming\Python\Python37\site-packages\joblib\_parallel_backends.py", line 208, in apply_async
    result = ImmediateResult(func)
  File "C:\Users\chalu\AppData\Roaming\Python\Python37\site-packages\joblib\_parallel_backends.py", line 572, in __init__
    self.results = batch()
  File "C:\Users\chalu\AppData\Roaming\Python\Python37\site-packages\joblib\parallel.py", line 263, in __call__
    for func, args, kwargs in self.items]
  File "C:\Users\chalu\AppData\Roaming\Python\Python37\site-packages\joblib\parallel.py", line 263, in <listcomp>
    for func, args, kwargs in self.items]
  File "C:\Users\chalu\AppData\Roaming\Python\Python37\lib\site-packages\sklearn\utils\fixes.py", line 216, in __call__
    return self.function(*args, **kwargs)
  File "C:\Users\chalu\AppData\Roaming\Python\Python37\lib\site-packages\sklearn\model_selection\_validation.py", line 702, in _fit_and_score
    test_scores = _score(estimator, X_test, y_test, scorer, error_score)
  File "C:\Users\chalu\AppData\Roaming\Python\Python37\lib\site-packages\sklearn\model_selection\_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\chalu\AppData\Roaming\Python\Python37\lib\site-packages\sklearn\metrics\_scorer.py", line 103, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "C:\Users\chalu\AppData\Roaming\Python\Python37\lib\site-packages\sklearn\metrics\_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "test2.py", line 65, in custom_metric_function
    adjusted=False)))
  File "C:\Users\chalu\AppData\Roaming\Python\Python37\lib\site-packages\sklearn\metrics\_classification.py", line 1983, in balanced_accuracy_score
    C = confusion_matrix(y_true, y_pred, sample_weight=sample_weight)
  File "C:\Users\chalu\AppData\Roaming\Python\Python37\lib\site-packages\sklearn\metrics\_classification.py", line 328, in confusion_matrix
    check_consistent_length(y_true, y_pred, sample_weight)
  File "C:\Users\chalu\AppData\Roaming\Python\Python37\lib\site-packages\sklearn\utils\validation.py", line 334, in check_consistent_length
    % [int(l) for l in lengths]
ValueError: Found input variables with inconsistent numbers of samples: [16, 16, 1]

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "test2.py", line 153, in <module>
    results = tpot.fit(X_train.values, y_train.values)
  File "C:\Users\chalu\AppData\Roaming\Python\Python37\site-packages\tpot\base.py", line 863, in fit
    raise e
  File "C:\Users\chalu\AppData\Roaming\Python\Python37\site-packages\tpot\base.py", line 854, in fit
    self._update_top_pipeline()
  File "C:\Users\chalu\AppData\Roaming\Python\Python37\site-packages\tpot\base.py", line 931, in _update_top_pipeline
    error_score="raise",
  File "C:\Users\chalu\AppData\Roaming\Python\Python37\lib\site-packages\sklearn\model_selection\_validation.py", line 520, in cross_val_score
    error_score=error_score,
  File "C:\Users\chalu\AppData\Roaming\Python\Python37\lib\site-packages\sklearn\model_selection\_validation.py", line 283, in cross_validate
    for train, test in cv.split(X, y, groups)
  File "C:\Users\chalu\AppData\Roaming\Python\Python37\site-packages\joblib\parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "C:\Users\chalu\AppData\Roaming\Python\Python37\site-packages\joblib\parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File "C:\Users\chalu\AppData\Roaming\Python\Python37\site-packages\joblib\parallel.py", line 777, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "C:\Users\chalu\AppData\Roaming\Python\Python37\site-packages\joblib\_parallel_backends.py", line 208, in apply_async
    result = ImmediateResult(func)
  File "C:\Users\chalu\AppData\Roaming\Python\Python37\site-packages\joblib\_parallel_backends.py", line 572, in __init__
    self.results = batch()
  File "C:\Users\chalu\AppData\Roaming\Python\Python37\site-packages\joblib\parallel.py", line 263, in __call__
    for func, args, kwargs in self.items]
  File "C:\Users\chalu\AppData\Roaming\Python\Python37\site-packages\joblib\parallel.py", line 263, in <listcomp>
    for func, args, kwargs in self.items]
  File "C:\Users\chalu\AppData\Roaming\Python\Python37\lib\site-packages\sklearn\utils\fixes.py", line 216, in __call__
    return self.function(*args, **kwargs)
  File "C:\Users\chalu\AppData\Roaming\Python\Python37\lib\site-packages\sklearn\model_selection\_validation.py", line 702, in _fit_and_score
    test_scores = _score(estimator, X_test, y_test, scorer, error_score)
  File "C:\Users\chalu\AppData\Roaming\Python\Python37\lib\site-packages\sklearn\model_selection\_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\chalu\AppData\Roaming\Python\Python37\lib\site-packages\sklearn\metrics\_scorer.py", line 103, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "C:\Users\chalu\AppData\Roaming\Python\Python37\lib\site-packages\sklearn\metrics\_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "test2.py", line 65, in custom_metric_function
    adjusted=False)))
  File "C:\Users\chalu\AppData\Roaming\Python\Python37\lib\site-packages\sklearn\metrics\_classification.py", line 1983, in balanced_accuracy_score
    C = confusion_matrix(y_true, y_pred, sample_weight=sample_weight)
  File "C:\Users\chalu\AppData\Roaming\Python\Python37\lib\site-packages\sklearn\metrics\_classification.py", line 328, in confusion_matrix
    check_consistent_length(y_true, y_pred, sample_weight)
  File "C:\Users\chalu\AppData\Roaming\Python\Python37\lib\site-packages\sklearn\utils\validation.py", line 334, in check_consistent_length
    % [int(l) for l in lengths]
ValueError: Found input variables with inconsistent numbers of samples: [16, 16, 1]

Context of the issue

This error typically suggests that the shapes of my data (X_train.shape[0] and y_train.shape[0]) are different, however this is not the case as seen from this printout:

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
>>> (5933, 538) (1978, 538) (5933,) (1978,)

I found a similar issue #1148 where they suggested downgrading the scikit-learn version to 0.23.2, which I have already attempted, as well as running it with the latest version 1.0.2. Another issue #1147 turned me on to the idea that maybe my max_eval_time_mins=5 was too short (as pipelines that take too long return a -float(inf) as default), so I up'd it to 20 which is overkill for this dataset just to be safe, but that didn't help either. Note, I am also using cv=TimeSeriesSplit(n_splits=2) but this doesn't cause a problem.

To try and narrow this issue down a bit, I've tried changing the scoring=my_custom_scorer, parameter to a known metric of scoring='balanced_accuracy', to ensure the code runs, which it DOES, so I know it's something to do with my custom scoring metric.

To help even further, you'll see in my code below that I have commented out all of the other scoring metrics and I'm trying to simply make use of sklearn's balanced_accuracy_score function, so I'm NOT doing the proposed "average of several metrics" thing for now because I need to figure out how to make it "work" with just one first. So essentially, instead of defining scoring='balanced_accuracy',, I'm doing:

def custom_metric_function(y_true, y_pred):
    the_balanced_accuracy_score = min(1,float(balanced_accuracy_score(y_true=y_true, 
                                                                y_pred=y_pred, 
                                                                sample_weight=sample_weights,
                                                                adjusted=False)))

    # Return the one metric
    return the_balanced_accuracy_score

...
scoring=my_custom_scorer,
...

...which sort of SHOULD be the same thing?

I've commented the code well and reduced it to a minimal working example, so long as you are using the same dataset, which you can download in parquet format from my Google Drive link here (size is 32.9 Mb):

https://drive.google.com/file/d/1I1ZwnxsSv5iDna9nmSduW9T0UWMQzaIt/view?usp=sharing

Process to reproduce the issue

Download the dataset from above, and store it in a folder called data in the same location as this script. So you should have this_script.py and the data folder in the same directory.
Run the script
TPOT crashes after generation 1

Expected result

Training shouldn't crash.

Current result

I've narrowed it down to the custom scoring metric. The code runs when scoring='balanced_accuracy' is set, but not with scoring=my_custom_metric. The dataset is free of inf's and -inf's, and there are no nan's, which you can see once the .describe() function is saved to an excel document.

Possible fix

Not sure yet, but I've tried to mitigate inf in the balanced_accuracy_score function by implementing:

    the_balanced_accuracy_score = min(1, float(balanced_accuracy_score(y_true=y_true, 
                                                                                                                    y_pred=y_pred, 
                                                                                                                    sample_weight=sample_weights,
                                                                                                                    adjusted=False)))

...but this didn't solve it either. Again, the code works if not using the custom scoring metric as defined in the code.

To sum, I'm running Python 3.7.9, Windows 10, scikit-learn==1.0.2, and tpot==0.11.7.

Here is the code:

from statistics import mean
from numpy import unique
from pandas import read_parquet
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.metrics import auc, f1_score, make_scorer, balanced_accuracy_score, precision_score, recall_score, cohen_kappa_score, matthews_corrcoef, precision_recall_curve, roc_auc_score, roc_curve
from sklearn.utils import compute_sample_weight
from tpot import TPOTClassifier

# ------------------------------------------------------------------------------------------------ #
#                                             VARIABLES                                            #
# ------------------------------------------------------------------------------------------------ #
TEST_SIZE = 0.25

# ------------------------------------------------------------------------------------------------ #
#                                           IMPORT THE DF                                          #
# ------------------------------------------------------------------------------------------------ #
imported_df = read_parquet("./data/df.parquet")

# ------------------------------------------------------------------------------------------------ #
#                                          INSPECT THE DF                                          #
# ------------------------------------------------------------------------------------------------ #
print(imported_df['label'].value_counts())
print(imported_df.describe())
imported_df.describe().to_excel("df_describe.xlsx", index=True)

# ------------------------------------------------------------------------------------------------ #
#                                         TRAIN TEST SPLIT                                         #
# ------------------------------------------------------------------------------------------------ #
X_train, X_test, y_train, y_test = train_test_split(imported_df.drop('label', axis=1), imported_df['label'],
                                                    test_size=TEST_SIZE,
                                                    shuffle=False)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# ------------------------------------------------------------------------------------------------ #
#                     DEFINE CUSTOM SCORING METRIC - AVERAGE OF LOTS OF METRICS                    #
# ------------------------------------------------------------------------------------------------ #
def custom_metric_function(y_true, y_pred):

    # Compute sample weights re: imbalanced data
    sample_weights = compute_sample_weight("balanced", unique(y_true))

    # START CALCULATING SCORES
    # NOTE: I take the min of 1 and whatever the score is, is to mitigate any
    # possible inf values. If score is 0.89, that'll be the score it takes.
    # NOTE: Printing the scores doesn't work/show anything, but left them in
    # for debugging.
    the_balanced_accuracy_score = min(1,float(balanced_accuracy_score(y_true=y_true, 
                                                                y_pred=y_pred, 
                                                                sample_weight=sample_weights,
                                                                adjusted=False)))
    print(the_balanced_accuracy_score)

    # the_precision_score = min(1,float(precision_score(y_true=y_true, 
    #                                             y_pred=y_pred,
    #                                             labels=[0,1,2],
    #                                             average='weighted',
    #                                             sample_weight=sample_weights,
    #                                             zero_division=1,
    #                                             )))
    # print(the_precision_score)

    # the_recall_score = min(1,float(recall_score(y_true=y_true, 
    #                                         y_pred=y_pred,
    #                                         labels=[0,1,2],
    #                                         average='weighted',
    #                                         sample_weight=sample_weights,
    #                                         zero_division=1,
    #                                         )))
    # print(the_recall_score)

    # the_f1_score = min(1,float(f1_score(y_true=y_true, 
    #                                 y_pred=y_pred, 
    #                                 labels=[0,1,2],
    #                                 average='weighted', 
    #                                 sample_weight=sample_weights,
    #                                 zero_division=1,
    #                                 )))
    # print(the_f1_score)

    # the_cohen_kappa_score = min(1,float(max(0,cohen_kappa_score(y1=y_true,
    #                                                       y2=y_pred,
    #                                                       labels=[0,1,2],
    #                                                       sample_weight=sample_weights,
    #                                                       ))))
    # print(the_cohen_kappa_score)

    # the_matthews_corrcoef = min(1,float(max(0,matthews_corrcoef(y_true=y_true,
    #                                                       y_pred=y_pred,
    #                                                       sample_weight=sample_weights,
    #                                                       ))))
    # print(the_matthews_corrcoef)

    # the_roc_auc_score = min(1,float(roc_auc_score(y_true=y_true,
    #                                   y_pred=y_pred,
    #                                   average='weighted',
    #                                   sample_weight=sample_weights,
    #                                   multi_class="ovr")))
    # print(the_roc_auc_score)

    # Return the one metric
    return the_balanced_accuracy_score

    # # Return the average
    # return mean([the_balanced_accuracy_score,
    #             the_precision_score,
    #             the_recall_score,
    #             the_f1_score,
    #             the_cohen_kappa_score,
    #             the_matthews_corrcoef,
    #             the_roc_auc_score,
    #             ])

# ------------------------------------------------------------------------------------------------ #
#                                    CREATE THE TPOT CLASSIFIER                                    #
# ------------------------------------------------------------------------------------------------ #
my_custom_scorer = make_scorer(custom_metric_function, greater_is_better=True) # , needs_proba=True

tpot = TPOTClassifier(generations=100, 
                     population_size=20,
                     offspring_size=None, 
                     mutation_rate=0.9,
                     crossover_rate=0.1,
                     scoring=my_custom_scorer,
                     cv=TimeSeriesSplit(n_splits=2), # Using time series split here
                     subsample=1.0, 
                     n_jobs=2,
                     max_time_mins=None, 
                     max_eval_time_mins=20, # 5
                     random_state=None, 
                    #  config_dict=classifier_config_dict,
                     template=None,
                     warm_start=False,
                     memory=None,
                     use_dask=False,
                     periodic_checkpoint_folder=None,
                     early_stop=2,
                     verbosity=2,
                     disable_update_check=False)

results = tpot.fit(X_train.values, y_train.values)
print(tpot.score(X_test.values, y_test.values))

EpistasisLab / tpot