ClimbsRocks / auto_ml

[UNMAINTAINED] Automated machine learning for analytics & production
http://auto-ml.readthedocs.io
MIT License
1.64k stars 310 forks source link

ValueError: fill value must be in categories #422

Open IFV opened 5 years ago

IFV commented 5 years ago

Hello you all!

I am getting the error below on auto_ml version 2.9.10 when trying to train a model by following this guide: https://auto-ml.readthedocs.io/en/latest/analytics.html

**---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-106-c3c8faf1013e> in <module>()
      1 ml_predictor = Predictor(type_of_estimator='regressor', column_descriptions=column_descriptions)
      2 
----> 3 ml_predictor.train(train_subset)
      4 
      5 # Score the model on test data

~\Anaconda3\lib\site-packages\auto_ml\predictor.py in train(***failed resolving arguments***)
    650                         estimator_names = self._get_estimator_names()
    651 
--> 652                     X_df = self.fit_transformation_pipeline(X_df, y, estimator_names)
    653             else:
    654                 X_df = self.transformation_pipeline.transform(X_df)

~\Anaconda3\lib\site-packages\auto_ml\predictor.py in fit_transformation_pipeline(self, X_df, y, model_names)
    901 
    902         # We are intentionally overwriting X_df here to try to save some memory space
--> 903         X_df = ppl.fit_transform(X_df, y)
    904 
    905         self.transformation_pipeline = self._consolidate_pipeline(ppl)

~\Anaconda3\lib\site-packages\sklearn\pipeline.py in fit_transform(self, X, y, **fit_params)
    281         Xt, fit_params = self._fit(X, y, **fit_params)
    282         if hasattr(last_step, 'fit_transform'):
--> 283             return last_step.fit_transform(Xt, y, **fit_params)
    284         elif last_step is None:
    285             return Xt

~\Anaconda3\lib\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params)
    518         else:
    519             # fit method of arity 2 (supervised transformation)
--> 520             return self.fit(X, y, **fit_params).transform(X)
    521 
    522 

~\Anaconda3\lib\site-packages\auto_ml\DataFrameVectorizer.py in transform(self, X, y)
    269 
    270     def transform(self, X, y=None):
--> 271         return self._transform(X)
    272 
    273     def get_feature_names(self):

~\Anaconda3\lib\site-packages\auto_ml\DataFrameVectorizer.py in _transform(self, X)
    177                     X[col] = 0
    178 
--> 179             X.fillna(0, inplace=True)
    180 
    181             for idx, col in enumerate(self.numerical_columns):

~\Anaconda3\lib\site-packages\pandas\core\frame.py in fillna(self, value, method, axis, inplace, limit, downcast, **kwargs)
   3788                      self).fillna(value=value, method=method, axis=axis,
   3789                                   inplace=inplace, limit=limit,
-> 3790                                   downcast=downcast, **kwargs)
   3791 
   3792     @Appender(_shared_docs['replace'] % _shared_doc_kwargs)

~\Anaconda3\lib\site-packages\pandas\core\generic.py in fillna(self, value, method, axis, inplace, limit, downcast)
   5425                 new_data = self._data.fillna(value=value, limit=limit,
   5426                                              inplace=inplace,
-> 5427                                              downcast=downcast)
   5428             elif isinstance(value, DataFrame) and self.ndim == 2:
   5429                 new_data = self.where(self.notna(), value)

~\Anaconda3\lib\site-packages\pandas\core\internals.py in fillna(self, **kwargs)
   3706 
   3707     def fillna(self, **kwargs):
-> 3708         return self.apply('fillna', **kwargs)
   3709 
   3710     def downcast(self, **kwargs):

~\Anaconda3\lib\site-packages\pandas\core\internals.py in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)
   3579 
   3580             kwargs['mgr'] = self
-> 3581             applied = getattr(b, f)(**kwargs)
   3582             result_blocks = _extend_blocks(applied, result_blocks)
   3583 

~\Anaconda3\lib\site-packages\pandas\core\internals.py in fillna(self, value, limit, inplace, downcast, mgr)
   2004                mgr=None):
   2005         values = self.values if inplace else self.values.copy()
-> 2006         values = values.fillna(value=value, limit=limit)
   2007         return [self.make_block_same_class(values=values,
   2008                                            placement=self.mgr_locs,

~\Anaconda3\lib\site-packages\pandas\util\_decorators.py in wrapper(*args, **kwargs)
    176                 else:
    177                     kwargs[new_arg_name] = new_arg_value
--> 178             return func(*args, **kwargs)
    179         return wrapper
    180     return _deprecate_kwarg

~\Anaconda3\lib\site-packages\pandas\core\arrays\categorical.py in fillna(self, value, method, limit)
   1754             elif is_hashable(value):
   1755                 if not isna(value) and value not in self.categories:
-> 1756                     raise ValueError("fill value must be in categories")
   1757 
   1758                 mask = values == -1

ValueError: fill value must be in categories

**

Am I missing any pre-processing step?

column_descriptions = { 'F11': 'output', 'F0': 'categorical', 'F2': 'categorical'}

Here is part of the dataset (it contains no missing values):

Feat0 Feat1 Feat2 Feat3 Feat4 Feat5 Feat6 Feat7 Feat8 Feat9 Feat10 Feat11 Feat12 Feat13 Feat14 Feat15
1 1 1 557 557 736 720 5068 99 83 51 16 0 209 5 43
1 1 1 713 715 912 858 4069 59 43 24 14 -2 122 4 31
1 1 1 723 720 945 944 290 142 144 126 1 3 762 5 11
1 1 1 757 755 843 840 3038 46 45 20 3 2 89 6 20
1 1 1 822 805 1143 1123 287 141 138 112 20 17 936 4 25
1 1 1 1047 1045 1151 1155 570 64 70 53 -4 2 325 5 6
1 1 1 1246 1250 2030 2020 244 284 270 260 10 -4 2161 6 18
1 1 1 1323 1317 1644 1637 544 141 140 114 7 6 868 6 21
1 1 1 1405 1320 1528 1435 3604 83 75 57 53 45 281 13 13
1 1 1 1406 1406 1516 1521 43 70 75 60 -5 0 399 2 8
1 1 1 1510 1510 1607 1609 958 57 59 36 -2 0 193 6 15
1 1 1 1512 1435 1806 1732 2042 114 117 98 34 37 700 6 10
1 1 1 1656 1655 1944 1932 1771 228 217 206 12 1 1324 6 16
1 1 1 1805 1805 1955 1955 678 50 50 30 0 0 162 8 12
1 1 1 1857 1855 2121 2140 1143 144 165 126 -19 2 861 4 14
1 1 1 1915 1658 1949 1725 2476 94 87 72 144 137 369 3 19
1 2 2 608 610 726 714 110 78 64 52 12 -2 258 4 22
1 2 2 610 615 714 739 419 124 144 107 -25 -5 737 4 13
1 2 2 617 620 734 751 2451 77 91 64 -17 -3 451 4 9
1 2 2 620 610 723 717 17 63 67 46 6 10 196 2 15
1 2 2 646 630 814 750 981 88 80 75 24 16 405 6 7
1 2 2 653 651 845 833 1507 172 162 149 12 2 929 11 12
1 2 2 657 700 1145 1158 616 168 178 146 -13 -3 1175 9 13
1 2 2 713 645 853 826 524 100 101 72 27 28 454 7 21
1 2 2 714 715 913 915 1295 119 120 99 -2 -1 631 5 15
1 2 2 953 952 1034 1042 1763 101 110 82 -8 1 589 10 9
1 2 2 1111 1115 1556 1600 1480 225 225 186 -4 -4 1520 11 28
1 2 2 1156 1200 1232 1233 786 36 33 19 -1 -4 55 8 9
1 2 2 1210 1205 1414 1340 5192 124 95 75 34 5 264 17 32
1 2 2 1231 1232 1227 1233 4064 56 61 30 -6 -1 157 21 5
1 2 2 1233 1236 1338 1337 1978 65 61 35 1 -3 190 22 8
1 2 2 1310 1305 1405 1405 34 55 60 44 0 5 276 2 9
1 2 2 1441 1420 1547 1529 707 66 69 46 18 21 237 3 17
1 2 2 1447 1450 1719 1717 2577 152 147 111 2 -3 798 27 14
1 2 2 1457 1500 1604 1616 6866 67 76 40 -12 -3 214 7 20
1 2 2 1500 1500 1650 1743 143 290 343 268 -53 0 2288 7 15
1 2 2 1545 1530 1646 1648 2445 181 198 164 -2 15 1235 6 11
1 2 2 1643 1645 1935 1929 866 112 104 94 6 -2 599 6 12
1 2 2 1745 1700 1925 1831 653 100 91 83 54 45 432 5 12
1 2 2 1747 1748 1817 1811 526 30 23 11 6 -1 32 3 16
1 2 2 1756 1740 2118 2100 1108 142 140 122 18 16 861 8 12
1 2 2 1820 1744 1945 1901 3511 85 77 60 44 36 228 2 23
1 2 2 1844 1845 2100 2055 814 136 130 109 5 -1 786 10 17
1 2 2 1924 1920 2035 2028 758 71 68 35 7 4 193 4 32
1 2 2 1955 1825 2121 1953 1402 86 88 69 88 90 430 3 14
1 2 2 2030 2010 2145 2128 5283 75 78 46 17 20 178 5 24
1 2 2 2155 2147 114 101 70 139 134 117 13 8 872 6 16
1 3 3 305 200 1024 922 936 259 262 237 62 65 1979 6 16
1 3 3 615 615 726 717 1366 71 62 49 9 0 247 9 13