dask / dask-ml

Scalable Machine Learning with Dask
http://ml.dask.org
BSD 3-Clause "New" or "Revised" License
892 stars 255 forks source link

LinearRegression does not work with dask dataframe.values #325

Closed js3711 closed 6 years ago

js3711 commented 6 years ago

Hello,

I am trying to fit a linear regression model from a dask dataframe because my data will not fit into local memory.

import sklearn.datasets as sk_datasets
import dask.dataframe as dd
X, y = sk_datasets.make_classification(n_samples=10000, n_informative=12, 
                                       n_redundant=18, n_features=30)
df_x = dd.from_array(X)
df_y = dd.from_array(y)
dask_x = df_x.values
dask_y = df_y.values
lr = LinearRegression(fit_intercept=True)
lr.fit(dask_x, dask_y)

This throws:

~/anaconda3/envs/correlation_exploration/lib/python3.6/site-packages/dask_glm/utils.py in add_intercept(X)
    145 def add_intercept(X):
    146     if np.isnan(np.sum(X.shape)):
--> 147         raise NotImplementedError("Can not add intercept to array with "
    148                                   "unknown chunk shape")
    149     j, k = X.chunks

NotImplementedError: Can not add intercept to array with unknown chunk shape
lr = LinearRegression(fit_intercept=False)
lr.fit(dask_x, dask_y)

This throws:

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-35-fedc331fe3fb> in <module>()
      1 lr = LinearRegression(fit_intercept=False)
      2 
----> 3 lr.fit(X_values, y_values)

~/anaconda3/envs/correlation_exploration/lib/python3.6/site-packages/dask_ml/linear_model/glm.py in fit(self, X, y)
    155         solver_kwargs = self._get_solver_kwargs()
    156 
--> 157         self._coef = algorithms._solvers[self.solver](X, y, **solver_kwargs)
    158         if self.fit_intercept:
    159             self.coef_ = self._coef[:-1]

~/anaconda3/envs/correlation_exploration/lib/python3.6/site-packages/dask_glm/utils.py in normalize_inputs(X, y, *args, **kwargs)
     24             mean = mean if len(intercept_idx[0]) else np.zeros(mean.shape)
     25             Xn = (X - mean) / std
---> 26             out = algo(Xn, y, *args, **kwargs).copy()
     27             i_adj = np.sum(out * mean / std)
     28             out[intercept_idx] -= i_adj

~/anaconda3/envs/correlation_exploration/lib/python3.6/site-packages/dask_glm/algorithms.py in admm(X, y, regularizer, lamduh, rho, over_relax, max_iter, abstol, reltol, family, **kwargs)
    262                                            fprime=fprime) for
    263                      xx, yy, bb, uu in zip(XD, yD, betas, u)]
--> 264         new_betas = np.array(da.compute(*new_betas))
    265 
    266         beta_hat = over_relax * new_betas + (1 - over_relax) * z

~/anaconda3/envs/correlation_exploration/lib/python3.6/site-packages/dask/base.py in compute(*args, **kwargs)
    400     keys = [x.__dask_keys__() for x in collections]
    401     postcomputes = [x.__dask_postcompute__() for x in collections]
--> 402     results = schedule(dsk, keys, **kwargs)
    403     return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
    404 

~/anaconda3/envs/correlation_exploration/lib/python3.6/site-packages/dask/threaded.py in get(dsk, result, cache, num_workers, **kwargs)
     73     results = get_async(pool.apply_async, len(pool._pool), dsk, result,
     74                         cache=cache, get_id=_thread_get_id,
---> 75                         pack_exception=pack_exception, **kwargs)
     76 
     77     # Cleanup pools associated to dead threads

~/anaconda3/envs/correlation_exploration/lib/python3.6/site-packages/dask/local.py in get_async(apply_async, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, **kwargs)
    503                         _execute_task(task, data)  # Re-execute locally
    504                     else:
--> 505                         raise_exception(exc, tb)
    506                 res, worker_id = loads(res_info)
    507                 state['cache'][key] = res

~/anaconda3/envs/correlation_exploration/lib/python3.6/site-packages/dask/compatibility.py in reraise(exc, tb)
     67         if exc.__traceback__ is not tb:
     68             raise exc.with_traceback(tb)
---> 69         raise exc
     70 
     71 else:

~/anaconda3/envs/correlation_exploration/lib/python3.6/site-packages/dask/local.py in execute_task(key, task_info, dumps, loads, get_id, pack_exception)
    272     try:
    273         task, data = loads(task_info)
--> 274         result = _execute_task(task, data)
    275         id = get_id()
    276         result = dumps((result, id))

~/anaconda3/envs/correlation_exploration/lib/python3.6/site-packages/dask/local.py in _execute_task(arg, cache, dsk)
    253         func, args = arg[0], arg[1:]
    254         args2 = [_execute_task(a, cache) for a in args]
--> 255         return func(*args2)
    256     elif not ishashable(arg):
    257         return arg

~/anaconda3/envs/correlation_exploration/lib/python3.6/site-packages/dask/compatibility.py in apply(func, args, kwargs)
     48     def apply(func, args, kwargs=None):
     49         if kwargs:
---> 50             return func(*args, **kwargs)
     51         else:
     52             return func(*args)

~/anaconda3/envs/correlation_exploration/lib/python3.6/site-packages/dask_glm/algorithms.py in local_update(X, y, beta, z, u, rho, f, fprime, solver)
    297     beta, f, d = solver(f, beta, fprime=fprime, args=solver_args,
    298                         maxiter=200,
--> 299                         maxfun=250)
    300 
    301     return beta

~/anaconda3/envs/correlation_exploration/lib/python3.6/site-packages/scipy/optimize/lbfgsb.py in fmin_l_bfgs_b(func, x0, fprime, args, approx_grad, bounds, m, factr, pgtol, epsilon, iprint, maxfun, maxiter, disp, callback, maxls)
    197 
    198     res = _minimize_lbfgsb(fun, x0, args=args, jac=jac, bounds=bounds,
--> 199                            **opts)
    200     d = {'grad': res['jac'],
    201          'task': res['message'],

~/anaconda3/envs/correlation_exploration/lib/python3.6/site-packages/scipy/optimize/lbfgsb.py in _minimize_lbfgsb(fun, x0, args, jac, bounds, disp, maxcor, ftol, gtol, eps, maxfun, maxiter, iprint, callback, maxls, **unknown_options)
    333             # until the completion of the current minimization iteration.
    334             # Overwrite f and g:
--> 335             f, g = func_and_grad(x)
    336         elif task_str.startswith(b'NEW_X'):
    337             # new iteration

~/anaconda3/envs/correlation_exploration/lib/python3.6/site-packages/scipy/optimize/lbfgsb.py in func_and_grad(x)
    283     else:
    284         def func_and_grad(x):
--> 285             f = fun(x, *args)
    286             g = jac(x, *args)
    287             return f, g

~/anaconda3/envs/correlation_exploration/lib/python3.6/site-packages/scipy/optimize/optimize.py in function_wrapper(*wrapper_args)
    291     def function_wrapper(*wrapper_args):
    292         ncalls[0] += 1
--> 293         return function(*(wrapper_args + args))
    294 
    295     return ncalls, function_wrapper

~/anaconda3/envs/correlation_exploration/lib/python3.6/site-packages/dask_glm/algorithms.py in wrapped(beta, X, y, z, u, rho)
    231         @functools.wraps(func)
    232         def wrapped(beta, X, y, z, u, rho):
--> 233             return func(beta, X, y) + (rho / 2) * np.dot(beta - z + u,
    234                                                          beta - z + u)
    235         return wrapped

~/anaconda3/envs/correlation_exploration/lib/python3.6/site-packages/dask_glm/families.py in pointwise_loss(beta, X, y)
     62     @staticmethod
     63     def pointwise_loss(beta, X, y):
---> 64         beta, y = beta.ravel(), y.ravel()
     65         Xbeta = X.dot(beta)
     66         return Normal.loglike(Xbeta, y)

AttributeError: 'tuple' object has no attribute 'ravel'
js3711 commented 6 years ago
Dependencies (output of `conda list`; columns: Name, Version, Build, Channel):
appdirs 1.4.3 py36h28b3542_0
appnope 0.1.0 py36hf537a9a_0
asn1crypto 0.24.0 py36_0
attrs 18.1.0 py36_0
automat 0.7.0 py36_0
backcall 0.1.0 py36_0
blas 1.0 mkl
bleach 2.1.3 py36_0
bokeh 0.13.0 py36_0
ca-certificates 2018.03.07 0
cairo 1.14.12 hc4e6be7_4
certifi 2018.4.16 py36_0
cffi 1.11.5 py36h342bebf_0
chardet 3.0.4
click 6.7 py36hec950be_0
cloudpickle 0.5.3 py36_0
constantly 15.1.0 py36h28b3542_0
cryptography 2.2.2 py36h1de35cc_0
cycler 0.10.0 py36hfc81398_0
cytoolz 0.9.0.1 py36h1de35cc_1
dask 0.18.2 py36_0
dask-core 0.18.2 py36_0
dask-glm 0.1.0 py36_0
dask-ml 0.7.0 py36h1de35cc_0
dask-searchcv 0.2.0 py36_0
decorator 4.3.0 py36_0
distributed 1.22.0 py36_0
entrypoints 0.2.3 py36_2
expat 2.2.5 hb8e80ba_0
fontconfig 2.13.0 h5d5b041_1
freetype 2.9.1 hb4e5f40_0
fribidi 1.0.4 h1de35cc_0
gettext 0.19.8.1 h15daf44_3
glib 2.56.1 h35bc53a_0
graphite2 1.3.11 h2098e52_2
graphviz 2.40.1 hefbbd9a_2
harfbuzz 1.7.6 hb8d4a28_3
heapdict 1.0.0 py36_2
html5lib 1.0.1 py36_0
hyperlink 18.0.0 py36_0
icu 58.2 h4b95b61_1
idna 2.7 py36_0
incremental 17.5.0 py36_0
intel-openmp 2018.0.3 0
ipykernel 4.8.2 py36_0
ipython 6.4.0 py36_1
ipython_genutils 0.2.0 py36h241746c_0
ipywidgets 7.3.0 py36_0
jedi 0.12.1 py36_0
jinja2 2.10 py36_0
jpeg 9b he5867d9_2
jsonschema 2.6.0 py36hb385e00_0
jupyter_client 5.2.3 py36_0
jupyter_core 4.4.0 py36_0
jupyterlab 0.32.1 py36_0
jupyterlab_launcher 0.10.5 py36_0
kiwisolver 1.0.1 py36h0a44026_0
libcxx 4.0.1 h579ed51_0
libcxxabi 4.0.1 hebd6815_0
libedit 3.1.20170329 hb402a30_2
libffi 3.2.1 h475c297_4
libgfortran 3.0.1 h93005f0_2
libiconv 1.15 hdd342a3_7
libpng 1.6.34 he12f830_0
libsodium 1.0.16 h3efe00b_0
libtiff 4.0.9 hcb84e12_1
libxml2 2.9.8 hab757c2_1
locket 0.2.0 py36hca03003_1
markupsafe 1.0 py36h1de35cc_1
matplotlib 2.2.2 py36hbf02d85_2
mistune 0.8.3 py36h1de35cc_1
mkl 2018.0.3 1
mkl_fft 1.0.2 py36h6b9c3cc_0
mkl_random 1.0.1 py36h5d10147_1
msgpack-python 0.5.6 py36h04f5b5a_0
multipledispatch 0.5.0 py36_0
nbconvert 5.3.1 py36_0
nbformat 4.4.0 py36h827af21_0
ncurses 6.1 h0a44026_0
networkx 2.1 py36_0
notebook 5.6.0 py36_0
numpy 1.14.5 py36h648b28d_4
numpy-base 1.14.5 py36ha9ae307_4
openssl 1.0.2o h26aff7b_0
packaging 17.1 py36_0
pandas 0.23.3 py36h6440ff4_0
pandoc 2.2.1 h1a437c5_0
pandocfilters 1.4.2 py36_1
pango 1.42.1 he2d0c7e_2
parso 0.3.1 py36_0
partd 0.3.8 py36hf5c4cb8_0
pcre 8.42 h378b8a2_0
pexpect 4.6.0 py36_0
pickleshare 0.7.4 py36hf512f8e_0
pip 10.0.1 py36_0
pixman 0.34.0 hca0a616_3
plotly 3.0.0rc11
prometheus_client 0.2.0 py36_0
prompt_toolkit 1.0.15 py36haeda067_0
psutil 5.4.6 py36h1de35cc_0
ptyprocess 0.6.0 py36_0
pyasn1 0.4.3 py36_0
pyasn1-modules 0.2.2 py36_0
pycparser 2.18 py36_1
pygments 2.2.0 py36h240cd3f_0
pygraphviz 1.3 py36h1de35cc_1
pyopenssl 18.0.0 py36_0
pyparsing 2.2.0 py36_1
python 3.6.6 hc167b69_0
python-dateutil 2.7.3 py36_0
python.app 2 py36_8
pytz 2018.5 py36_0
pyyaml 3.13 py36h1de35cc_0
pyzmq 17.0.0 py36h1de35cc_3
readline 7.0 hc1231fa_4
requests 2.19.1
retrying 1.3.3
scikit-learn 0.19.1 py36hf9f1f73_0
scipy 1.1.0 py36hf1f7d93_0
send2trash 1.5.0 py36_0
service_identity 17.0.0 py36h28b3542_0
setuptools 39.2.0 py36_0
simplegeneric 0.8.1 py36_2
six 1.11.0 py36_1
sortedcontainers 2.0.4 py36_0
sqlite 3.24.0 ha441bb4_0
tblib 1.3.2 py36hda67792_0
terminado 0.8.1 py36_1
testpath 0.3.1 py36h625a49b_0
tk 8.6.7 h35a86e2_3
toolz 0.9.0 py36_0
tornado 5.0.2 py36h1de35cc_0
traitlets 4.3.2 py36h65bd3ce_0
twisted 17.5.0 py36_0
urllib3 1.23
wcwidth 0.1.7 py36h8c6ec74_0
webencodings 0.5.1 py36_1
wheel 0.31.1 py36_0
widgetsnbextension 3.3.0 py36_0
xz 5.2.4 h1de35cc_4
yaml 0.1.7 hc338f04_2
zeromq 4.2.5 h0a44026_0
zict 0.1.3 py36_0
zlib 1.2.11 hf3cbc9b_2
zope 1.0 py36_0
zope.interface 4.5.0 py36h1de35cc_0
TomAugspurger commented 6 years ago

Just using df_x and df_y works, correct?


Some context: a dask dataframe doesn't know its own length, so doing df_x.values results in a dask array with unknown length. We can't concatenate an array of ones to X in that case, since we don't know how long to make the ones.

I plan to implement something like https://github.com/dask/dask/issues/3090 later today. There's the related https://github.com/dask/dask/issues/3293 issue.

mrocklin commented 6 years ago

Would it be possible for dask-ml to add a simple intercept term without having to fully compute things? This seems like the sort of thing that should be possible with map_blocks and a custom function. This seems common enough that forcing computation might be considered a usability bug.

On Mon, Jul 30, 2018 at 9:47 AM, Tom Augspurger notifications@github.com wrote:

Just using df_x and df_y works, correct?

Some context: a dask dataframe doesn't know its own length, so doing df_x.values results in a dask array with unknown length. We can't concatenate an array of ones to X in that case, since we don't know how long to make the ones.

I plan to implement something like dask/dask#3090 https://github.com/dask/dask/issues/3090 later today. There's the related dask/dask#3293 https://github.com/dask/dask/issues/3293 issue.

— You are receiving this because you are subscribed to this thread. Reply to this email directly, view it on GitHub https://github.com/dask/dask-ml/issues/325#issuecomment-408931936, or mute the thread https://github.com/notifications/unsubscribe-auth/AASszCPSyrINhlHmxUrZZiBKOcwbUVCVks5uLzixgaJpZM4Vmwy_ .

TomAugspurger commented 6 years ago

Yeah, using map_blocks should be sufficient here. I can take a look at that now.

js3711 commented 6 years ago

I can perform operations on df_x and df_y, but fitting on the dataframes directly raises a TypeError:

lr = LinearRegression(fit_intercept=True)
lr.fit(df_x, df_y)

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-66-7f17fbaf27c3> in <module>()
      1 lr = LinearRegression(fit_intercept=True)
      2 
----> 3 lr.fit(df_x, df_y)

~/anaconda3/envs/correlation_exploration/lib/python3.6/site-packages/dask_ml/linear_model/glm.py in fit(self, X, y)
    151         self : objectj
    152         """
--> 153         X = self._check_array(X)
    154 
    155         solver_kwargs = self._get_solver_kwargs()

~/anaconda3/envs/correlation_exploration/lib/python3.6/site-packages/dask_ml/linear_model/glm.py in _check_array(self, X)
    167             X = add_intercept(X)
    168 
--> 169         return check_array(X, accept_unknown_chunks=True)
    170 
    171 

~/anaconda3/envs/correlation_exploration/lib/python3.6/site-packages/dask_ml/utils.py in check_array(array, *args, **kwargs)
    139     elif isinstance(array, dd.DataFrame):
    140         if not accept_dask_dataframe:
--> 141             raise TypeError
    142 
    143         # TODO: sample?

TypeError: 
TomAugspurger commented 6 years ago

Fixed on master @js3711. Thanks for the report.