ray-project / xgboost_ray

Distributed XGBoost on Ray
Apache License 2.0

XGBoost ray swallows XGBoost errors #97

Closed richardliaw closed 3 years ago

richardliaw commented 3 years ago

Compare:

from typing import Iterable

import numpy as np
import xgboost as xgb
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from xgboost import DMatrix


def train_model(data: Iterable[Data]):  # Data is the demo project's own record type
    values, labels = zip(*((d.x, d.label) for d in data))
    le = preprocessing.LabelEncoder()
    le.fit(labels)

    labels = le.transform(labels)
    X_train, X_test, y_train, y_test = train_test_split(
        values, labels, stratify=labels)

    train_set = DMatrix(np.array(X_train), np.array(y_train))
    test_set = DMatrix(np.array(X_test), np.array(y_test))

    evals_result = {}
    bst = xgb.train(
        {
            "objective": "multi:softmax",
            "verbosity": 3,
            "eval_metric": ["logloss", "error"],
            "num_class": len(le.classes_)
        },
        train_set,
        evals_result=evals_result,
        evals=[(train_set, "train"), (test_set, "eval")],
        verbose_eval=False,
        # ray_params=xgbr.RayParams(
        #     num_actors=2,
        #     cpus_per_actor=1)
    )

to the Ray version

from typing import Iterable

import numpy as np
import xgboost_ray as xgbr
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from xgboost_ray import RayDMatrix


def train_model(data: Iterable[Data]):  # Data is the demo project's own record type
    values, labels = zip(*((d.x, d.label) for d in data))
    le = preprocessing.LabelEncoder()
    le.fit(labels)

    labels = le.transform(labels)
    X_train, X_test, y_train, y_test = train_test_split(
        values, labels, stratify=labels)

    train_set = RayDMatrix(np.array(X_train), np.array(y_train))
    test_set = RayDMatrix(np.array(X_test), np.array(y_test))

    evals_result = {}
    bst = xgbr.train(
        {
            "objective": "multi:softmax",
            "verbosity": 3,
            "eval_metric": ["logloss", "error"],
            "num_class": len(le.classes_)
        },
        train_set,
        evals_result=evals_result,
        evals=[(train_set, "train"), (test_set, "eval")],
        verbose_eval=False,
        ray_params=xgbr.RayParams(
            num_actors=2,
            cpus_per_actor=1)
    )

The first one will show:

  File "/Users/rliaw/dev/ray-summit-demo-2021/summit_2021/train.py", line 26, in train_model
    bst = xgb.train(
  File "/Users/rliaw/miniconda3/envs/demo/lib/python3.8/site-packages/xgboost/training.py", line 189, in train
    bst = _train_internal(params, dtrain,
  File "/Users/rliaw/miniconda3/envs/demo/lib/python3.8/site-packages/xgboost/training.py", line 82, in _train_internal
    if callbacks.after_iteration(bst, i, dtrain, evals):
  File "/Users/rliaw/miniconda3/envs/demo/lib/python3.8/site-packages/xgboost/callback.py", line 432, in after_iteration
    score = model.eval_set(evals, epoch, self.metric)
  File "/Users/rliaw/miniconda3/envs/demo/lib/python3.8/site-packages/xgboost/core.py", line 1562, in eval_set
    _check_call(_LIB.XGBoosterEvalOneIter(self.handle,
  File "/Users/rliaw/miniconda3/envs/demo/lib/python3.8/site-packages/xgboost/core.py", line 210, in _check_call
    raise XGBoostError(py_str(_LIB.XGBGetLastError()))
xgboost.core.XGBoostError: [18:05:05] /Users/travis/build/dmlc/xgboost/src/metric/elementwise_metric.cu:366: Check failed: preds.Size() == info.labels_.Size() (750 vs. 375) : label and prediction size not match, hint: use merror or mlogloss for multi-class classification

But the second one will swallow this error.
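(For reference, the underlying failure itself is the mismatched eval metrics: per the hint in the error above, multi-class objectives need merror/mlogloss instead of error/logloss. A minimal sketch of the corrected parameter dict, separate from the error-swallowing problem this issue is about:)

params = {
    "objective": "multi:softmax",
    "verbosity": 3,
    # multi-class training needs the multi-class metrics, per the XGBoost hint above
    "eval_metric": ["mlogloss", "merror"],
    "num_class": len(le.classes_),
}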

richardliaw commented 3 years ago

The second one will have:

2021-05-10 18:02:27,651 INFO elastic.py:156 -- Actor status: 2 alive, 0 dead (2 total)
Traceback (most recent call last):
  File "/Users/rliaw/miniconda3/envs/demo/lib/python3.8/site-packages/xgboost_ray/main.py", line 957, in _train
    ray.get(ready)
  File "/Users/rliaw/miniconda3/envs/demo/lib/python3.8/site-packages/ray/_private/client_mode_hook.py", line 47, in wrapper
    return func(*args, **kwargs)
  File "/Users/rliaw/miniconda3/envs/demo/lib/python3.8/site-packages/ray/worker.py", line 1481, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(RayXGBoostTrainingError): ray::RayXGBoostActor.train() (pid=92250, ip=192.168.1.115)
  File "python/ray/_raylet.pyx", line 505, in ray._raylet.execute_task
  File "python/ray/_raylet.pyx", line 449, in ray._raylet.execute_task.function_executor
  File "/Users/rliaw/miniconda3/envs/demo/lib/python3.8/site-packages/ray/_private/function_manager.py", line 556, in actor_method_executor
    return method(__ray_actor, *args, **kwargs)
  File "/Users/rliaw/miniconda3/envs/demo/lib/python3.8/site-packages/xgboost_ray/main.py", line 554, in train
    raise RayXGBoostTrainingError("Training failed.")
xgboost_ray.main.RayXGBoostTrainingError: Training failed.

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/Users/rliaw/miniconda3/envs/demo/lib/python3.8/site-packages/xgboost_ray/main.py", line 1248, in train
    bst, train_evals_result, train_additional_results = _train(
  File "/Users/rliaw/miniconda3/envs/demo/lib/python3.8/site-packages/xgboost_ray/main.py", line 983, in _train
    raise RayActorError from exc
ray.exceptions.RayActorError: The actor died unexpectedly before finishing this task.

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "scripts/typer_run_pipeline.py", line 23, in <module>
    typer.run(main)
  File "/Users/rliaw/miniconda3/envs/demo/lib/python3.8/site-packages/typer/main.py", line 859, in run
    app()
  File "/Users/rliaw/miniconda3/envs/demo/lib/python3.8/site-packages/typer/main.py", line 214, in __call__
    return get_command(self)(*args, **kwargs)
  File "/Users/rliaw/miniconda3/envs/demo/lib/python3.8/site-packages/click/core.py", line 829, in __call__
    return self.main(*args, **kwargs)
  File "/Users/rliaw/miniconda3/envs/demo/lib/python3.8/site-packages/click/core.py", line 782, in main
    rv = self.invoke(ctx)
  File "/Users/rliaw/miniconda3/envs/demo/lib/python3.8/site-packages/click/core.py", line 1066, in invoke
    return ctx.invoke(self.callback, **ctx.params)
  File "/Users/rliaw/miniconda3/envs/demo/lib/python3.8/site-packages/click/core.py", line 610, in invoke
    return callback(*args, **kwargs)
  File "/Users/rliaw/miniconda3/envs/demo/lib/python3.8/site-packages/typer/main.py", line 497, in wrapper
    return callback(**use_params)  # type: ignore
  File "scripts/typer_run_pipeline.py", line 19, in main
    train_model(processed)
  File "/Users/rliaw/dev/ray-summit-demo-2021/summit_2021/train.py", line 22, in train_model
    bst = xgbr.train(
  File "/Users/rliaw/miniconda3/envs/demo/lib/python3.8/site-packages/xgboost_ray/main.py", line 1319, in train
    raise RuntimeError(
RuntimeError: A Ray actor died during training and the maximum number of retries (0) is exhausted.
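To make the swallowing concrete: walking the __cause__ chain of the RuntimeError raised by xgboost_ray only reaches the RayActorError / RayXGBoostTrainingError wrappers shown above; the original XGBoostError text ("label and prediction size not match ...") is nowhere in the chain. A minimal sketch of that check, using the train_model call and the processed data from the snippets above:

try:
    train_model(processed)
except RuntimeError as exc:
    # Print every wrapped exception in the chain. With the behavior shown above,
    # the chain ends at the Ray/xgboost_ray wrappers and never surfaces the
    # original XGBoostError message from the training actor.
    cause = exc
    while cause is not None:
        print(type(cause).__name__, ":", cause)
        cause = cause.__cause__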