Minyus / causallift

CausalLift: Python package for causality-based Uplift Modeling in real-world business
https://causallift.readthedocs.io/
Other
338 stars 42 forks source link

XGBooster Invalid missing value: null #23

Open Peccer opened 1 year ago

Peccer commented 1 year ago

Running:

print('\n[Create 2 models for treatment and untreatment and estimate CATE (Conditional Average Treatment Effects)]')
train_df, test_df = cl.estimate_cate_by_2_models()

gives the error below. I ran the example notebook from the GitHub project.

╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮ │ /tmp/ipykernel_121/3555275851.py:5 in │ │ │ │ [Errno 2] No such file or directory: '/tmp/ipykernel_121/3555275851.py' │ │ │ │ /root/venv/lib/python3.9/site-packages/causallift/causal_lift.py:654 in │ │ estimate_cate_by_2_models │ │ │ │ 651 │ │ │ ) │ │ 652 │ │ │ │ 653 │ │ if self.runner: │ │ ❱ 654 │ │ │ self.kedro_context.run(tags=["311_fit", "312_bundle_2_models"]) │ │ 655 │ │ │ self.uplift_models_dict = self.kedro_context.catalog.load( │ │ 656 │ │ │ │ "uplift_models_dict" │ │ 657 │ │ │ ) │ │ │ │ /root/venv/lib/python3.9/site-packages/causallift/context/flexible_context.py:178 in run │ │ │ │ 175 │ │ │ + "only_missing: {}".format(only_missing) │ │ 176 │ │ │ + ")" │ │ 177 │ │ ) │ │ ❱ 178 │ │ return super().run( │ │ 179 │ │ │ tags=tags, runner=runner, node_names=node_names, only_missing=only_missing │ │ 180 │ │ ) │ │ 181 │ │ │ │ /root/venv/lib/python3.9/site-packages/causallift/context/flexible_context.py:141 in run │ │ │ │ 138 │ │ self, kwargs # type: Any │ │ 139 │ ): │ │ 140 │ │ # type: (...) 
-> Dict[str, Any] │ │ ❱ 141 │ │ d = super().run(kwargs) │ │ 142 │ │ self.catalog.add_feed_dict(d, replace=True) │ │ 143 │ │ return d │ │ 144 │ │ │ │ /root/venv/lib/python3.9/site-packages/causallift/context/flexible_context.py:131 in run │ │ │ │ 128 │ │ │ runner = ( │ │ 129 │ │ │ │ ParallelRunner() if runner == "ParallelRunner" else SequentialRunner() │ │ 130 │ │ │ ) │ │ ❱ 131 │ │ return super().run(runner=runner, **kwargs) │ │ 132 │ │ 133 │ │ 134 class ProjectContext2(ProjectContext1): │ │ │ │ /root/venv/lib/python3.9/site-packages/causallift/context/flexible_context.py:106 in run │ │ │ │ 103 │ │ runner = runner or SequentialRunner() │ │ 104 │ │ if only_missing and _skippable(self.catalog): │ │ 105 │ │ │ return runner.run_only_missing(pipeline, self.catalog) │ │ ❱ 106 │ │ return runner.run(pipeline, self.catalog) │ │ 107 │ │ 108 │ │ 109 def _skippable( │ │ │ │ /root/venv/lib/python3.9/site-packages/kedro/runner/runner.py:88 in run │ │ │ │ 85 │ │ │ self._logger.info( │ │ 86 │ │ │ │ "Asynchronous mode is enabled for loading and saving data" │ │ 87 │ │ │ ) │ │ ❱ 88 │ │ self._run(pipeline, catalog, hook_manager, session_id) │ │ 89 │ │ │ │ 90 │ │ self._logger.info("Pipeline execution completed successfully.") │ │ 91 │ │ │ │ /root/venv/lib/python3.9/site-packages/kedro/runner/sequential_runner.py:70 in _run │ │ │ │ 67 │ │ │ │ 68 │ │ for exec_index, node in enumerate(nodes): │ │ 69 │ │ │ try: │ │ ❱ 70 │ │ │ │ run_node(node, catalog, hook_manager, self._is_async, session_id) │ │ 71 │ │ │ │ done_nodes.add(node) │ │ 72 │ │ │ except Exception: │ │ 73 │ │ │ │ self._suggest_resume_scenario(pipeline, done_nodes, catalog) │ │ │ │ /root/venv/lib/python3.9/site-packages/kedro/runner/runner.py:304 in run_node │ │ │ │ 301 │ if is_async: │ │ 302 │ │ node = _run_node_async(node, catalog, hook_manager, session_id) │ │ 303 │ else: │ │ ❱ 304 │ │ node = _run_node_sequential(node, catalog, hook_manager, session_id) │ │ 305 │ │ │ 306 │ for name in node.confirms: │ │ 307 │ │ 
catalog.confirm(name) │ │ │ │ /root/venv/lib/python3.9/site-packages/kedro/runner/runner.py:398 in _run_node_sequential │ │ │ │ 395 │ ) │ │ 396 │ inputs.update(additional_inputs) │ │ 397 │ │ │ ❱ 398 │ outputs = _call_node_run( │ │ 399 │ │ node, catalog, inputs, is_async, hook_manager, session_id=session_id │ │ 400 │ ) │ │ 401 │ │ │ │ /root/venv/lib/python3.9/site-packages/kedro/runner/runner.py:366 in _call_node_run │ │ │ │ 363 │ │ │ is_async=is_async, │ │ 364 │ │ │ session_id=session_id, │ │ 365 │ │ ) │ │ ❱ 366 │ │ raise exc │ │ 367 │ hook_manager.hook.after_node_run( │ │ 368 │ │ node=node, │ │ 369 │ │ catalog=catalog, │ │ │ │ /root/venv/lib/python3.9/site-packages/kedro/runner/runner.py:356 in _call_node_run │ │ │ │ 353 ) -> Dict[str, Any]: │ │ 354 │ # pylint: disable=too-many-arguments │ │ 355 │ try: │ │ ❱ 356 │ │ outputs = node.run(inputs) │ │ 357 │ except Exception as exc: │ │ 358 │ │ hook_manager.hook.on_node_error( │ │ 359 │ │ │ error=exc, │ │ │ │ /root/venv/lib/python3.9/site-packages/kedro/pipeline/node.py:353 in run │ │ │ │ 350 │ │ # purposely catch all exceptions │ │ 351 │ │ except Exception as exc: │ │ 352 │ │ │ self._logger.error("Node '%s' failed with error: \n%s", str(self), str(exc)) │ │ ❱ 353 │ │ │ raise exc │ │ 354 │ │ │ 355 │ def _run_with_no_inputs(self, inputs: Dict[str, Any]): │ │ 356 │ │ if inputs: │ │ │ │ /root/venv/lib/python3.9/site-packages/kedro/pipeline/node.py:344 in run │ │ │ │ 341 │ │ │ elif isinstance(self._inputs, str): │ │ 342 │ │ │ │ outputs = self._run_with_one_input(inputs, self._inputs) │ │ 343 │ │ │ elif isinstance(self._inputs, list): │ │ ❱ 344 │ │ │ │ outputs = self._run_with_list(inputs, self._inputs) │ │ 345 │ │ │ elif isinstance(self._inputs, dict): │ │ 346 │ │ │ │ outputs = self._run_with_dict(inputs, self._inputs) │ │ 347 │ │ │ │ /root/venv/lib/python3.9/site-packages/kedro/pipeline/node.py:384 in _run_with_list │ │ │ │ 381 │ │ │ │ f"{sorted(inputs.keys())}." 
│ │ 382 │ │ │ ) │ │ 383 │ │ # Ensure the function gets the inputs in the correct order │ │ ❱ 384 │ │ return self._func((inputs[item] for item in node_inputs)) │ │ 385 │ │ │ 386 │ def _run_with_dict(self, inputs: Dict[str, Any], node_inputs: Dict[str, str]): │ │ 387 │ │ # Node inputs and provided run inputs should completely overlap │ │ │ │ /root/venv/lib/python3.9/site-packages/causallift/nodes/model_for_each.py:234 in │ │ model_for_treated_fit │ │ │ │ 231 │ │ 232 │ │ 233 def model_for_treated_fit(posargs, kwargs): │ │ ❱ 234 │ return ModelForTreated().fit(*posargs, *kwargs) │ │ 235 │ │ 236 │ │ 237 def model_for_treated_predict_proba(posargs, kwargs): │ │ │ │ /root/venv/lib/python3.9/site-packages/causallift/nodes/model_for_each.py:94 in fit │ │ │ │ 91 │ │ │ else: │ │ 92 │ │ │ │ log.info("## Feature importances not available.") │ │ 93 │ │ │ │ ❱ 94 │ │ y_pred_train = model.predict(X_train) │ │ 95 │ │ │ │ 96 │ │ y_test = None │ │ 97 │ │ y_pred_test = None │ │ │ │ /shared-libs/python3.9/py/lib/python3.9/site-packages/sklearn/model_selection/_search.py:500 in │ │ predict │ │ │ │ 497 │ │ │ the best found parameters. 
│ │ 498 │ │ """ │ │ 499 │ │ check_is_fitted(self) │ │ ❱ 500 │ │ return self.bestestimator.predict(X) │ │ 501 │ │ │ 502 │ @available_if(_estimator_has("predict_proba")) │ │ 503 │ def predict_proba(self, X): │ │ │ │ /root/venv/lib/python3.9/site-packages/xgboost/sklearn.py:1434 in predict │ │ │ │ 1431 │ │ base_margin: Optional[ArrayLike] = None, │ │ 1432 │ │ iteration_range: Optional[Tuple[int, int]] = None, │ │ 1433 │ ) -> np.ndarray: │ │ ❱ 1434 │ │ class_probs = super().predict( │ │ 1435 │ │ │ X=X, │ │ 1436 │ │ │ output_margin=output_margin, │ │ 1437 │ │ │ ntree_limit=ntree_limit, │ │ │ │ /root/venv/lib/python3.9/site-packages/xgboost/sklearn.py:1049 in predict │ │ │ │ 1046 │ │ iteration_range = self._get_iteration_range(iteration_range) │ │ 1047 │ │ if self._can_use_inplace_predict(): │ │ 1048 │ │ │ try: │ │ ❱ 1049 │ │ │ │ predts = self.get_booster().inplace_predict( │ │ 1050 │ │ │ │ │ data=X, │ │ 1051 │ │ │ │ │ iteration_range=iteration_range, │ │ 1052 │ │ │ │ │ predict_type="margin" if output_margin else "value", │ │ │ │ /root/venv/lib/python3.9/site-packages/xgboost/core.py:2147 in inplace_predict │ │ │ │ 2144 │ │ if isinstance(data, np.ndarray): │ │ 2145 │ │ │ from .data import _ensure_npdtype │ │ 2146 │ │ │ data, = _ensure_np_dtype(data, data.dtype) │ │ ❱ 2147 │ │ │ _check_call( │ │ 2148 │ │ │ │ _LIB.XGBoosterPredictFromDense( │ │ 2149 │ │ │ │ │ self.handle, │ │ 2150 │ │ │ │ │ _array_interface(data), │ │ │ │ /root/venv/lib/python3.9/site-packages/xgboost/core.py:246 in _check_call │ │ │ │ 243 │ │ return value from API calls │ │ 244 │ """ │ │ 245 │ if ret != 0: │ │ ❱ 246 │ │ raise XGBoostError(py_str(_LIB.XGBGetLastError())) │ │ 247 │ │ 248 │ │ 249 def _has_categorical(booster: "Booster", data: DataType) -> bool: │ ╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ XGBoostError: [12:04:08] ../src/c_api/c_api_utils.h:159: Invalid missing value: null Stack trace: [bt] (0) 
/root/venv/lib/python3.9/site-packages/xgboost/lib/libxgboost.so(+0xbbec9) [0x7f5d31953ec9] [bt] (1) /root/venv/lib/python3.9/site-packages/xgboost/lib/libxgboost.so(+0xdeb90) [0x7f5d31976b90] [bt] (2) /root/venv/lib/python3.9/site-packages/xgboost/lib/libxgboost.so(+0xe45d8) [0x7f5d3197c5d8] [bt] (3) /root/venv/lib/python3.9/site-packages/xgboost/lib/libxgboost.so(XGBoosterPredictFromDense+0x330) [0x7f5d3195c4d0] [bt] (4) /usr/lib/x86_64-linux-gnu/libffi.so.6(ffi_call_unix64+0x4c) [0x7f5dccad38ee] [bt] (5) /usr/lib/x86_64-linux-gnu/libffi.so.6(ffi_call+0x22f) [0x7f5dccad32bf] [bt] (6) /usr/local/lib/python3.9/lib-dynload/_ctypes.cpython-39-x86_64-linux-gnu.so(+0x13111) [0x7f5dccaf1111] [bt] (7) /usr/local/lib/python3.9/lib-dynload/_ctypes.cpython-39-x86_64-linux-gnu.so(+0x81ed) [0x7f5dccae61ed] [bt] (8) /usr/local/lib/libpython3.9.so.1.0(_PyObject_MakeTpCall+0x79) [0x7f5dcdd1ced9]

Minyus commented 1 year ago

Hi @Peccer,

Thank you for reporting the issue. Recent versions of xgboost may not work. Could you try an older version released 1-2 years ago?

Besides, Python 3.9 may or may not work with CausalLift. The latest tested version of Python is 3.7.