dask / dask-xgboost

BSD 3-Clause "New" or "Revised" License
162 stars 43 forks source link

Check column names before passing on #15

Open TomAugspurger opened 6 years ago

TomAugspurger commented 6 years ago

For example, if the column names are ints, xgboost will refuse them as feature names and raise a ValueError:

import pandas as pd
import numpy as np
import dask.dataframe as dd
import dask_xgboost as xgb
from distributed import Client

df = pd.DataFrame({0: np.random.randint(0, 2, size=100),
                   1: np.random.uniform(0, 1, size=100),
                   2: np.random.uniform(0, 1, size=100)})
a = dd.from_pandas(df, 2)
labels = a.loc[:, 0]
data = a.loc[:, 1:]

c = Client()

xgb.train(c, {}, data, labels)
ValueError                                Traceback (most recent call last)
<ipython-input-6-ea984a812dfe> in <module>()
     14 c = Client()
     15
---> 16 xgb.train(c, {}, data, labels)

~/sandbox/dask-xgboost/dask_xgboost/core.py in train(client, params, data, labels, dmatrix_kwargs, **kwargs)
    167     """
    168     return sync(client.loop, _train, client, params, data,
--> 169                 labels, dmatrix_kwargs, **kwargs)
    170
    171

~/Envs/dask-dev/lib/python3.6/site-packages/distributed/distributed/utils.py in sync(loop, func, *args, **kwargs)
    252             e.wait(1000000)
    253     if error[0]:
--> 254         six.reraise(*error[0])
    255     else:
    256         return result[0]

~/Envs/dask-dev/lib/python3.6/site-packages/six.py in reraise(tp, value, tb)
    691             if value.__traceback__ is not tb:
    692                 raise value.with_traceback(tb)
--> 693             raise value
    694         finally:
    695             value = None

~/Envs/dask-dev/lib/python3.6/site-packages/distributed/distributed/utils.py in f()
    236             yield gen.moment
    237             thread_state.asynchronous = True
--> 238             result[0] = yield make_coro()
    239         except Exception as exc:
    240             logger.exception(exc)

~/.virtualenvs/dask-dev/lib/python3.6/site-packages/tornado/gen.py in run(self)
   1053
   1054                     try:
-> 1055                         value = future.result()
   1056                     except Exception:
   1057                         self.had_exception = True

~/.virtualenvs/dask-dev/lib/python3.6/site-packages/tornado/concurrent.py in result(self, timeout)
    236         if self._exc_info is not None:
    237             try:
--> 238                 raise_exc_info(self._exc_info)
    239             finally:
    240                 self = None

~/.virtualenvs/dask-dev/lib/python3.6/site-packages/tornado/util.py in raise_exc_info(exc_info)

~/.virtualenvs/dask-dev/lib/python3.6/site-packages/tornado/gen.py in run(self)
   1061                     if exc_info is not None:
   1062                         try:
-> 1063                             yielded = self.gen.throw(*exc_info)
   1064                         finally:
   1065                             # Break up a reference to itself

~/sandbox/dask-xgboost/dask_xgboost/core.py in _train(client, params, data, labels, dmatrix_kwargs, **kwargs)
    132
    133     # Get the results, only one will be non-None
--> 134     results = yield client._gather(futures)
    135     result = [v for v in results if v][0]
    136     raise gen.Return(result)

~/.virtualenvs/dask-dev/lib/python3.6/site-packages/tornado/gen.py in run(self)
   1053
   1054                     try:
-> 1055                         value = future.result()
   1056                     except Exception:
   1057                         self.had_exception = True

~/.virtualenvs/dask-dev/lib/python3.6/site-packages/tornado/concurrent.py in result(self, timeout)
    236         if self._exc_info is not None:
    237             try:
--> 238                 raise_exc_info(self._exc_info)
    239             finally:
    240                 self = None

~/.virtualenvs/dask-dev/lib/python3.6/site-packages/tornado/util.py in raise_exc_info(exc_info)

~/.virtualenvs/dask-dev/lib/python3.6/site-packages/tornado/gen.py in run(self)
   1061                     if exc_info is not None:
   1062                         try:
-> 1063                             yielded = self.gen.throw(*exc_info)
   1064                         finally:
   1065                             # Break up a reference to itself

~/Envs/dask-dev/lib/python3.6/site-packages/distributed/distributed/client.py in _gather(self, futures, errors, direct, local_worker)
   1305                             six.reraise(type(exception),
   1306                                         exception,
-> 1307                                         traceback)
   1308                     if errors == 'skip':
   1309                         bad_keys.add(key)

~/Envs/dask-dev/lib/python3.6/site-packages/six.py in reraise(tp, value, tb)
    690                 value = tp()
    691             if value.__traceback__ is not tb:
--> 692                 raise value.with_traceback(tb)
    693             raise value
    694         finally:

~/sandbox/dask-xgboost/dask_xgboost/core.py in train_part()
     66     labels = concat(labels)
     67     dmatrix_kwargs["feature_names"] = getattr(data, 'columns', None)
---> 68     dtrain = xgb.DMatrix(data, labels, **dmatrix_kwargs)
     69
     70     args = [('%s=%s' % item).encode() for item in env.items()]

~/sandbox/xgboost/python-package/xgboost/core.py in __init__()
    294                 self.set_weight(weight)
    295
--> 296         self.feature_names = feature_names
    297         self.feature_types = feature_types
    298

~/sandbox/xgboost/python-package/xgboost/core.py in feature_names()
    663                        not any(x in f for x in set(('[', ']', '<')))
    664                        for f in feature_names):
--> 665                 raise ValueError('feature_names may not contain [, ] or <')
    666         else:
    667             # reset feature_types also

ValueError: feature_names may not contain [, ] or <
chenzikun commented 5 years ago

Hello, is this issue resolved? @TomAugspurger

TomAugspurger commented 5 years ago

Seems like it's still open. Are you interested in working on it?

On Mon, Nov 26, 2018 at 6:10 AM chenzikun notifications@github.com wrote:

hello, is this issue resolved? @TomAugspurger https://github.com/TomAugspurger

— You are receiving this because you were mentioned. Reply to this email directly, view it on GitHub https://github.com/dask/dask-xgboost/issues/15#issuecomment-441617700, or mute the thread https://github.com/notifications/unsubscribe-auth/ABQHIjYwQyef2OsIwHi8otcFsCfcGJJaks5uy9pTgaJpZM4Qq6IO .

chenzikun commented 5 years ago

When I use xgboost on a single machine, it works well, but I get the same error with dask-xgboost. I am trying to figure out what happened.

chenzikun commented 5 years ago

~/sandbox/dask-xgboost/dask_xgboost/core.py in train_part()
     66     labels = concat(labels)
     67     dmatrix_kwargs["feature_names"] = getattr(data, 'columns', None)
---> 68     dtrain = xgb.DMatrix(data, labels, **dmatrix_kwargs)

Why is "feature_names" passed to xgb.DMatrix?

TomAugspurger commented 5 years ago

The feature names are nice to preserve.

On Mon, Nov 26, 2018 at 7:07 AM chenzikun notifications@github.com wrote:

~/sandbox/dask-xgboost/dask_xgboost/core.py in train_part() 66 labels = concat(labels) 67 dmatrix_kwargs["feature_names"] = getattr(data, 'columns', None) ---> 68 dtrain = xgb.DMatrix(data, labels, **dmatrix_kwargs)

why send "feature_names" to xgb.DMatrix?

— You are receiving this because you were mentioned. Reply to this email directly, view it on GitHub https://github.com/dask/dask-xgboost/issues/15#issuecomment-441632498, or mute the thread https://github.com/notifications/unsubscribe-auth/ABQHInLdqxBRmI_ZPx4RvhjUg8fQkKcmks5uy-eGgaJpZM4Qq6IO .