Closed jrzaurin closed 6 years ago
I am reproducing the code directly from the Kaggle kernel on the Porto Insurance dataset:
# Reproduction script: run the MLBox read / drift-filter / optimise pipeline
# on the train/test CSV files, scoring with a custom normalized Gini metric.
from mlbox.preprocessing import *
from mlbox.optimisation import *
from mlbox.prediction import *

# Imported explicitly so the Gini helpers below do not depend on `np` leaking
# in through the star-imports above.
import numpy as np

# In[ ]:

paths = ["data/train.csv", "data/test.csv"]
target_name = "target"

# # Now let MLBox do the job !
# ## ... to read and clean all the files

# In[ ]:

rd = Reader(sep=",")
df = rd.train_test_split(paths, target_name)  # reading and preprocessing (dates, ...)

# In[ ]:

dft = Drift_thresholder()
df = dft.fit_transform(df)  # removing non-stable features (like ID,...)

# ## ... to tune all the hyper-parameters

# In[ ]:


def gini(actual, pred, cmpcol=0, sortcol=1):
    """Return the (unnormalized) Gini coefficient of `pred` against `actual`.

    Rows are ordered by decreasing `pred`, ties broken by original position,
    and the cumulative share of `actual` is accumulated in that order.

    `cmpcol` and `sortcol` are accepted for signature compatibility but are
    not used by the computation.
    """
    assert len(actual) == len(pred)
    # Columns: 0 = actual, 1 = pred, 2 = original row index (tie-breaker).
    # Builtin `float` replaces the `np.float` alias, which newer NumPy
    # releases no longer provide; the resulting dtype is the same.
    stacked = np.asarray(
        np.c_[actual, pred, np.arange(len(actual))], dtype=float
    )
    # lexsort uses the LAST key as the primary one: sort by -pred, then index.
    stacked = stacked[np.lexsort((stacked[:, 2], -1 * stacked[:, 1]))]
    totalLosses = stacked[:, 0].sum()
    giniSum = stacked[:, 0].cumsum().sum() / totalLosses
    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)


def gini_normalized(a, p):
    """Return |gini(a, p) / gini(a, a)|, i.e. Gini relative to a perfect ranking."""
    return np.abs(gini(a, p) / gini(a, a))


# NOTE(review): `make_scorer` is not imported here; it is presumably brought
# into scope by the mlbox star-imports above -- confirm.
opt = Optimiser(
    scoring=make_scorer(gini_normalized, greater_is_better=True, needs_proba=True),
    n_folds=2,
)

# In[ ]:

space = {
    'est__strategy': {"search": "choice", "space": ["LightGBM"]},
    'est__n_estimators': {"search": "choice", "space": [700]},
    'est__colsample_bytree': {"search": "uniform", "space": [0.77, 0.82]},
    'est__subsample': {"search": "uniform", "space": [0.73, 0.8]},
    'est__max_depth': {"search": "choice", "space": [5, 6, 7]},
    'est__learning_rate': {"search": "uniform", "space": [0.008, 0.02]},
}

params = opt.optimise(space, df, 7)
It returns the following error:
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-43-801cc68c2cb6> in <module>()
     16 }
     17
---> 18 params = opt.optimise(space, df, 7)

/usr/local/lib/python2.7/site-packages/mlbox/optimisation/optimiser.pyc in optimise(self, space, df, max_evals)
    564 space=hyper_space,
    565 algo=tpe.suggest,
--> 566 max_evals=max_evals)
    567
    568 # Displaying best_params

/usr/local/lib/python2.7/site-packages/hyperopt/fmin.pyc in fmin(fn, space, algo, max_evals, trials, rstate, allow_trials_fmin, pass_expr_memo_ctrl, catch_eval_exceptions, verbose, return_argmin)
    312
    313 domain = base.Domain(fn, space,
--> 314 pass_expr_memo_ctrl=pass_expr_memo_ctrl)
    315
    316 rval = FMinIter(algo, domain, trials, max_evals=max_evals,

/usr/local/lib/python2.7/site-packages/hyperopt/base.pyc in __init__(self, fn, expr, workdir, pass_expr_memo_ctrl, name, loss_target)
    784 before = pyll.dfs(self.expr)
    785 # -- raises exception if expr contains cycles
--> 786 pyll.toposort(self.expr)
    787 vh = self.vh = VectorizeHelper(self.expr, self.s_new_ids)
    788 # -- raises exception if v_expr contains cycles

/usr/local/lib/python2.7/site-packages/hyperopt/pyll/base.pyc in toposort(expr)
    713 G.add_edges_from([(n_in, node) for n_in in node.inputs()])
    714 order = nx.topological_sort(G)
--> 715 assert order[-1] == expr
    716 return order
    717

TypeError: 'generator' object has no attribute '__getitem__'
Maybe this is related to the hyperopt version?
Nice library though 👍
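Actually, my apologies: it has to do with the networkx version. As the traceback shows, `nx.topological_sort(G)` returned a generator in my installed networkx, and hyperopt then tries to index it with `order[-1]`. I downgraded: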
pip install networkx==1.11
and off we go!
I am reproducing the code directly from the Kaggle kernel on the Porto Insurance dataset:
It returns the following error:
Maybe this is related to the hyperopt version?
Nice library though 👍