dask / dask-xgboost

BSD 3-Clause "New" or "Revised" License
162 stars 43 forks source link

How can I avoid dask-xgboost no-work status #69

Open wyxandsj opened 4 years ago

wyxandsj commented 4 years ago

I used 28 million (2800万) samples to train dask-xgboost, but the training task's status always shows no work.

    from distributed import Client, progress
from dask.distributed import Client as Client2
import dask.dataframe as dd
import pandas as pd
import dask_xgboost as dxgb

# Input shards to load (headerless CSVs with columns C, A, B — see
# data2dataframe below).
filenames = ['/data//000000_0', '/data//000001_0']

# Column layout shared with data2dataframe.
# NOTE(review): the former `global feature` / `global y_name` statements
# were removed — `global` only has an effect inside a function body; at
# module scope these names are already global.
feature = ["A", "B"]
y_name = ["C"]

# Scheduler address is a placeholder ("xx.xx.xx.xx:xx") — fill in the
# real host:port of the dask scheduler before running.
client2 = Client2("xx.xx.xx.xx:xx")

def data2dataframe(fn, feature_cols=None, label_cols=None):
    """Load one headerless CSV shard and split it into feature/label frames.

    Parameters
    ----------
    fn : str
        Path to a comma-separated file with no header row; the label
        column(s) come first, followed by the feature columns.
    feature_cols : list[str], optional
        Feature column names. Defaults to the module-level ``feature``
        (kept for backward compatibility with the original global-based
        call sites).
    label_cols : list[str], optional
        Label column names. Defaults to the module-level ``y_name``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(features, labels)`` with every column cast to float64.
        Missing values ('NULL' or empty) become 0.0.
    """
    if feature_cols is None:
        feature_cols = feature
    if label_cols is None:
        label_cols = y_name
    # Labels precede features in the raw file.
    df = pd.read_csv(fn, names=label_cols + feature_cols,
                     na_values='NULL', header=None, sep=',')
    # Fill NA with the string "0"; the float cast below turns it into 0.0
    # (preserves the original code's exact behavior).
    df = df.fillna("0")
    # All named columns are numeric, so cast the whole frame at once.
    df = df.astype("float64")
    return (df[feature_cols], df[label_cols])

# Load every shard on the cluster, then pull the results back to the client.
futures2 = client2.map(data2dataframe, filenames)
results = client2.gather(futures2)

# Build the training frames with ONE concat each. The previous version
# accumulated via pairwise pd.concat inside an `i`-counter loop, which
# re-copies the growing frame on every iteration (quadratic in the number
# of files) and shadowed the common name `re`. Default concat keeps the
# original per-file indexes, matching the old behavior.
X_parts, y_parts = zip(*results)
X_trains = pd.concat(X_parts)
y_trains = pd.concat(y_parts)

# Round-trip through dask.dataframe so dask-xgboost can distribute the
# partitions across workers.
X_trains = dd.from_pandas(X_trains, npartitions=54)
y_trains = dd.from_pandas(y_trains, npartitions=54)
dd_train = X_trains
dd_train_label = y_trains

# Binary classification with shallow trees and a small learning rate.
params = {'objective': 'binary:logistic',
          'max_depth': 1, 'eta': 0.01, 'subsample': 0.5,
          'min_child_weight': 1}

bst = dxgb.train(client2, params, dd_train, dd_train_label,
                 num_boost_round=140)
predictions = dxgb.predict(client2, bst, dd_train)
print(predictions.persist())
TomAugspurger commented 4 years ago

Do you have a minimal example? http://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports