oegedijk / explainerdashboard

Quickly build Explainable AI dashboards that show the inner workings of so-called "blackbox" machine learning models.
http://explainerdashboard.readthedocs.io
MIT License
2.29k stars 331 forks source link

Feature data : pandas.DataFrame column "" has dtype 'category' but is not in cat_features list #232

Closed Tanay0510 closed 1 year ago

Tanay0510 commented 2 years ago

I am creating a CatBoost model using the following code:

Cat_features_names = [0,1]

clf = CatBoostClassifier(iterations = 50, random seed = 42, learning_rate = 0.1)

clf.fit(X_train, y_train, cat_features = cat_features_names,
eval_set = (X_test, y_test),
verbose = False,
plot = True)

explainer = ClassifierExplainer(clf, X_test, y_test, cv = 3, model_output = 'logodds', cats=cat_features_names)

db = ExplainerDashboard(explainer, title="xyz", cats= True, hide_cats = True, hide_pdp = True, hide_whatifpdp = True, shap_interaction = False)

db.run(port=8051) 

When I run this, I get the following error:


CatBoostError                             Traceback (most recent call last)
Input In [120], in <cell line: 1>()
----> 1 db = ExplainerDashboard(explainer, title="Titanic Explainer",cats=True, hide_cats=True, hide_pdp=True, hide_whatifpdp=True, shap_interaction=False)
      2 db.run(port=8051)

File ~/opt/anaconda3/lib/python3.9/site-packages/explainerdashboard/dashboards.py:590, in ExplainerDashboard.__init__(self, explainer, tabs, title, name, description, simple, hide_header, header_hide_title, header_hide_selector, header_hide_download, hide_poweredby, block_selector_callbacks, pos_label, fluid, mode, width, height, bootstrap, external_stylesheets, server, url_base_pathname, responsive, logins, port, importances, model_summary, contributions, whatif, shap_dependence, shap_interaction, decision_trees, **kwargs)
    588 if isinstance(tabs, list):
    589     tabs = [self._convert_str_tabs(tab) for tab in tabs]
--> 590     self.explainer_layout = ExplainerTabsLayout(explainer, tabs, title, 
    591                     description=self.description,
    592                     **update_kwargs(kwargs, 
    593                         header_hide_title=self.header_hide_title, 
    594                         header_hide_selector=self.header_hide_selector, 
    595                         header_hide_download=self.header_hide_download, 
    596                         hide_poweredby=self.hide_poweredby,
    597                         block_selector_callbacks=self.block_selector_callbacks,
    598                         pos_label=self.pos_label,
    599                         fluid=fluid))
    600 else:
    601     tabs = self._convert_str_tabs(tabs)

File ~/opt/anaconda3/lib/python3.9/site-packages/explainerdashboard/dashboards.py:109, in ExplainerTabsLayout.__init__(self, explainer, tabs, title, name, description, header_hide_title, header_hide_selector, header_hide_download, hide_poweredby, block_selector_callbacks, pos_label, fluid, **kwargs)
    105 assert len(self.tabs) > 0, 'When passing a list to tabs, need to pass at least one valid tab!'
    107 self.register_components(*self.tabs)
--> 109 self.downloadable_tabs = [tab for tab in self.tabs if tab.to_html(add_header=False) != "<div></div>"]
    110 if not self.downloadable_tabs:
    111     self.header_hide_download = True

File ~/opt/anaconda3/lib/python3.9/site-packages/explainerdashboard/dashboards.py:109, in <listcomp>(.0)
    105 assert len(self.tabs) > 0, 'When passing a list to tabs, need to pass at least one valid tab!'
    107 self.register_components(*self.tabs)
--> 109 self.downloadable_tabs = [tab for tab in self.tabs if tab.to_html(add_header=False) != "<div></div>"]
    110 if not self.downloadable_tabs:
    111     self.header_hide_download = True

File ~/opt/anaconda3/lib/python3.9/site-packages/explainerdashboard/dashboard_components/composites.py:197, in ClassifierModelStatsComposite.to_html(self, state_dict, add_header)
    194 def to_html(self, state_dict=None, add_header=True):
    195     html = to_html.hide(to_html.title(self.title), hide=self.hide_title)
    196     html += to_html.card_rows(
--> 197         [to_html.hide(self.summary.to_html(state_dict, add_header=False), hide=self.hide_modelsummary),
    198          to_html.hide(self.confusionmatrix.to_html(state_dict, add_header=False), hide=self.hide_confusionmatrix)],
    199         [to_html.hide(self.precision.to_html(state_dict, add_header=False), hide=self.hide_precision), 
    200          to_html.hide(self.classification.to_html(state_dict, add_header=False), hide=self.hide_classification)],
    201         [to_html.hide(self.rocauc.to_html(state_dict, add_header=False), hide=self.hide_rocauc),
    202          to_html.hide(self.prauc.to_html(state_dict, add_header=False), hide=self.hide_prauc)],
    203         [to_html.hide(self.liftcurve.to_html(state_dict, add_header=False), hide=self.hide_liftcurve),
    204          to_html.hide(self.cumulative_precision.to_html(state_dict, add_header=False), hide=self.hide_cumprecision)]
    205     )
    206     if add_header:
    207         return to_html.add_header(html)

File ~/opt/anaconda3/lib/python3.9/site-packages/explainerdashboard/dashboard_components/classifier_components.py:1633, in ClassifierModelSummaryComponent.to_html(self, state_dict, add_header)
   1631 def to_html(self, state_dict=None, add_header=True):
   1632     args = self.get_state_args(state_dict)
-> 1633     metrics_df = self._get_metrics_df(args['cutoff'], args['pos_label'])
   1634     html = to_html.table_from_df(metrics_df)
   1635     html = to_html.card(html, title=self.title)

File ~/opt/anaconda3/lib/python3.9/site-packages/explainerdashboard/dashboard_components/classifier_components.py:1642, in ClassifierModelSummaryComponent._get_metrics_df(self, cutoff, pos_label)
   1640 def _get_metrics_df(self, cutoff, pos_label):
   1641     metrics_df = (pd.DataFrame(
-> 1642                             self.explainer.metrics(cutoff=cutoff, pos_label=pos_label, 
   1643                                                     show_metrics=self.show_metrics), 
   1644                             index=["Score"])
   1645                           .T.rename_axis(index="metric").reset_index()
   1646                           .round(self.round))
   1647     return metrics_df

File ~/opt/anaconda3/lib/python3.9/site-packages/explainerdashboard/explainers.py:60, in insert_pos_label.<locals>.inner(self, *args, **kwargs)
     57     else:  
     58         # insert self.pos_label 
     59         kwargs.update(dict(pos_label=self.pos_label))
---> 60         return func(self, *args, **kwargs)
     61 kwargs.update(dict(zip(inspect.getfullargspec(func).args[1:1+len(args)], args)))
     62 if 'pos_label' in kwargs:

File ~/opt/anaconda3/lib/python3.9/site-packages/explainerdashboard/explainers.py:2679, in ClassifierExplainer.metrics(self, cutoff, show_metrics, pos_label)
   2676                 self._metrics[label][cut] = \
   2677                     get_metrics(0.01*cut, label)
   2678     else:
-> 2679         self._metrics = get_cv_metrics(self.cv)
   2682 if int(cutoff*100) in self._metrics[pos_label]:
   2683     metrics_dict =  self._metrics[pos_label][int(cutoff*100)]

File ~/opt/anaconda3/lib/python3.9/site-packages/explainerdashboard/explainers.py:2651, in ClassifierExplainer.metrics.<locals>.get_cv_metrics(n_splits)
   2649 X_train, X_test = self.X.iloc[train_index], self.X.iloc[test_index]
   2650 y_train, y_test = self.y.iloc[train_index], self.y.iloc[test_index]
-> 2651 preds = clone(self.model).fit(X_train, y_train).predict_proba(X_test)
   2652 for label in range(len(self.labels)):
   2653     for cut in np.linspace(1, 99, 99, dtype=int):

File ~/opt/anaconda3/lib/python3.9/site-packages/catboost/core.py:5007, in CatBoostClassifier.fit(self, X, y, cat_features, text_features, embedding_features, sample_weight, baseline, use_best_model, eval_set, verbose, logging_level, plot, column_description, verbose_eval, metric_period, silent, early_stopping_rounds, save_snapshot, snapshot_file, snapshot_interval, init_model, callbacks, log_cout, log_cerr)
   5004 if 'loss_function' in params:
   5005     CatBoostClassifier._check_is_compatible_loss(params['loss_function'])
-> 5007 self._fit(X, y, cat_features, text_features, embedding_features, None, sample_weight, None, None, None, None, baseline, use_best_model,
   5008           eval_set, verbose, logging_level, plot, column_description, verbose_eval, metric_period,
   5009           silent, early_stopping_rounds, save_snapshot, snapshot_file, snapshot_interval, init_model, callbacks, log_cout, log_cerr)
   5010 return self

File ~/opt/anaconda3/lib/python3.9/site-packages/catboost/core.py:2262, in CatBoost._fit(self, X, y, cat_features, text_features, embedding_features, pairs, sample_weight, group_id, group_weight, subgroup_id, pairs_weight, baseline, use_best_model, eval_set, verbose, logging_level, plot, column_description, verbose_eval, metric_period, silent, early_stopping_rounds, save_snapshot, snapshot_file, snapshot_interval, init_model, callbacks, log_cout, log_cerr)
   2259 if y is None and not isinstance(X, PATH_TYPES + (Pool,)):
   2260     raise CatBoostError("y may be None only when X is an instance of catboost.Pool or string")
-> 2262 train_params = self._prepare_train_params(
   2263     X=X, y=y, cat_features=cat_features, text_features=text_features, embedding_features=embedding_features,
   2264     pairs=pairs, sample_weight=sample_weight, group_id=group_id, group_weight=group_weight,
   2265     subgroup_id=subgroup_id, pairs_weight=pairs_weight, baseline=baseline, use_best_model=use_best_model,
   2266     eval_set=eval_set, verbose=verbose, logging_level=logging_level, plot=plot,
   2267     column_description=column_description, verbose_eval=verbose_eval, metric_period=metric_period,
   2268     silent=silent, early_stopping_rounds=early_stopping_rounds, save_snapshot=save_snapshot,
   2269     snapshot_file=snapshot_file, snapshot_interval=snapshot_interval, init_model=init_model,
   2270     callbacks=callbacks
   2271 )
   2272 params = train_params["params"]
   2273 train_pool = train_params["train_pool"]

File ~/opt/anaconda3/lib/python3.9/site-packages/catboost/core.py:2148, in CatBoost._prepare_train_params(self, X, y, cat_features, text_features, embedding_features, pairs, sample_weight, group_id, group_weight, subgroup_id, pairs_weight, baseline, use_best_model, eval_set, verbose, logging_level, plot, column_description, verbose_eval, metric_period, silent, early_stopping_rounds, save_snapshot, snapshot_file, snapshot_interval, init_model, callbacks)
   2145 text_features = _process_feature_indices(text_features, X, params, 'text_features')
   2146 embedding_features = _process_feature_indices(embedding_features, X, params, 'embedding_features')
-> 2148 train_pool = _build_train_pool(X, y, cat_features, text_features, embedding_features, pairs,
   2149                                sample_weight, group_id, group_weight, subgroup_id, pairs_weight,
   2150                                baseline, column_description)
   2151 if train_pool.is_empty_:
   2152     raise CatBoostError("X is empty.")

File ~/opt/anaconda3/lib/python3.9/site-packages/catboost/core.py:1430, in _build_train_pool(X, y, cat_features, text_features, embedding_features, pairs, sample_weight, group_id, group_weight, subgroup_id, pairs_weight, baseline, column_description)
   1428     if y is None:
   1429         raise CatBoostError("y has not initialized in fit(): X is not catboost.Pool object, y must be not None in fit().")
-> 1430     train_pool = Pool(X, y, cat_features=cat_features, text_features=text_features, embedding_features=embedding_features, pairs=pairs, weight=sample_weight, group_id=group_id,
   1431                       group_weight=group_weight, subgroup_id=subgroup_id, pairs_weight=pairs_weight, baseline=baseline)
   1432 return train_pool

File ~/opt/anaconda3/lib/python3.9/site-packages/catboost/core.py:790, in Pool.__init__(self, data, label, cat_features, text_features, embedding_features, embedding_features_data, column_description, pairs, delimiter, has_header, ignore_csv_quoting, weight, group_id, group_weight, subgroup_id, pairs_weight, baseline, timestamp, feature_names, feature_tags, thread_count, log_cout, log_cerr)
    784         if isinstance(feature_names, PATH_TYPES):
    785             raise CatBoostError(
    786                 "feature_names must be None or have non-string type when the pool is created from "
    787                 "python objects."
    788             )
--> 790         self._init(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
    791                    group_id, group_weight, subgroup_id, pairs_weight, baseline, timestamp, feature_names, feature_tags, thread_count)
    792 super(Pool, self).__init__()

File ~/opt/anaconda3/lib/python3.9/site-packages/catboost/core.py:1411, in Pool._init(self, data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight, group_id, group_weight, subgroup_id, pairs_weight, baseline, timestamp, feature_names, feature_tags, thread_count)
   1409 if feature_tags is not None:
   1410     feature_tags = self._check_transform_tags(feature_tags, feature_names)
-> 1411 self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
   1412                 group_id, group_weight, subgroup_id, pairs_weight, baseline, timestamp, feature_names, feature_tags, thread_count)

File _catboost.pyx:3941, in _catboost._PoolBase._init_pool()

File _catboost.pyx:3991, in _catboost._PoolBase._init_pool()

File _catboost.pyx:3807, in _catboost._PoolBase._init_features_order_layout_pool()

File _catboost.pyx:2731, in _catboost._set_features_order_data_pd_data_frame()

CatBoostError: features data: pandas.DataFrame column 'XYZ' has dtype 'category' but is not in  cat_features list

I don't understand why this is happening. Also, if I don't use explainerdashboard and instead create the model, fit it, and calculate the SHAP values directly with shap and catboost, I don't get this error, so I think there is something wrong with explainerdashboard.

I am stuck at this point and can't figure out whether the problem is in the catboost library, in explainerdashboard, or in my code.

bratGitHub commented 1 year ago

I guess it was caused by a typo. You define Cat_features_names but you pass cat_features_names.