dask / dask-ml

Scalable Machine Learning with Dask
http://ml.dask.org
BSD 3-Clause "New" or "Revised" License

PCA+pipeline+GridSearchCV error #629

Open rmg55 opened 4 years ago

rmg55 commented 4 years ago

There seems to be an issue with an sklearn PCA + Pipeline combined with dask_ml's GridSearchCV. Please see my example below. Apologies if I am totally missing something.

Relevant Versions:
```
dask: 2.12.0
dask_ml: 1.2.0
sklearn: 0.22.2.post1
```

Minimal Example:

The following code shows that the scikit-learn grid search with the PCA pipeline and the dask-ml grid search with the SelectKBest pipeline both run, but the dask-ml grid search with the PCA pipeline fails.

```python
from dask.distributed import Client, LocalCluster
from dask_ml.model_selection import GridSearchCV as dask_GS
from sklearn.model_selection import GridSearchCV as sk_GS
from sklearn.datasets import make_multilabel_classification
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
import numpy as np

cluster = LocalCluster(n_workers=2,threads_per_worker=2)
client = Client(cluster)

X, Y = make_multilabel_classification(n_classes=12, n_labels=1,n_features=271,
                                      n_samples=1200,
                                      random_state=1)
Y = Y.sum(axis=1)

N_FEATURES_OPTIONS_pca = np.arange(10)[1::3]
N_FEATURES_OPTIONS_sel = np.arange(10)[1::3]
Cs = [1,10,100.]
gammas = [.001,0.01]

pca = PCA(iterated_power='auto')
selection = SelectKBest(f_classif)
svc = svm.SVC()

pipe1 = Pipeline([
    ('reduce_dim', selection),
    ('classify', svc)])

pipe2 = Pipeline([
    ('reduce_dim', pca),
    ('classify', svc)])

param_grid1 = [{'reduce_dim__k': N_FEATURES_OPTIONS_sel,
               'classify__C': Cs,
               'classify__gamma': gammas}]
param_grid2 = [{'reduce_dim__n_components': N_FEATURES_OPTIONS_pca,
               'classify__C': Cs,
               'classify__gamma': gammas}]

#Sklearn Gridsearch with PCA pipeline
sk_clf = sk_GS(pipe2, param_grid2,cv=3,scoring='f1_macro',refit=True)
sk_clf.fit(X,Y)
print(sk_clf.best_score_)

#Dask Gridsearch with SelectKbest
dask_clf1 = dask_GS(pipe1, param_grid1,cv=3,scheduler=client,scoring='f1_macro',refit=True)
dask_clf1.fit(X,Y)
print(dask_clf1.best_score_)

#Dask Gridsearch with PCA
dask_clf2 = dask_GS(pipe2, param_grid2,cv=3,scheduler=client,scoring='f1_macro',refit=True)
dask_clf2.fit(X,Y)
print(dask_clf2.best_score_)
```

Running this produces several core dump files and the following error:
```
0.18358307544807187
0.16009973534039232
distributed.nanny - WARNING - Restarting worker
distributed.nanny - WARNING - Restarting worker
distributed.nanny - WARNING - Restarting worker
distributed.nanny - WARNING - Restarting worker
('score-f44a4381cb4779b9d45ba2c0ba7c2a72', 15, 1) has failed... retrying
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
in
     55 #Dask Gridsearch with PCA
     56 dask_clf2 = dask_GS(pipe2, param_grid2,cv=3,scheduler=client,scoring='f1_macro',refit=True)
---> 57 dask_clf2.fit(X,Y)
     58 print(dask_clf2.best_score_)

/opt/conda/envs/py_geo/lib/python3.7/site-packages/dask_ml/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
   1255             else:
   1256                 logger.warning("{} has failed... retrying".format(future.key))
-> 1257                 future.retry()
   1258                 ac.add(future)
   1259

/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/client.py in retry(self, **kwargs)
    307         Client.retry
    308         """
--> 309         return self.client.retry([self], **kwargs)
    310
    311     def cancelled(self):

/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/client.py in retry(self, futures, asynchronous)
   2139         futures: list of Futures
   2140         """
-> 2141         return self.sync(self._retry, futures, asynchronous=asynchronous)
   2142
   2143     @gen.coroutine

/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/client.py in sync(self, func, asynchronous, callback_timeout, *args, **kwargs)
    778         else:
    779             return sync(
--> 780                 self.loop, func, *args, callback_timeout=callback_timeout, **kwargs
    781             )
    782

/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/utils.py in sync(loop, func, callback_timeout, *args, **kwargs)
    346     if error[0]:
    347         typ, exc, tb = error[0]
--> 348         raise exc.with_traceback(tb)
    349     else:
    350         return result[0]

/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/utils.py in f()
    330             if callback_timeout is not None:
    331                 future = asyncio.wait_for(future, callback_timeout)
--> 332             result[0] = yield future
    333         except Exception as exc:
    334             error[0] = sys.exc_info()

/opt/conda/envs/py_geo/lib/python3.7/site-packages/tornado/gen.py in run(self)
    733
    734                     try:
--> 735                         value = future.result()
    736                     except Exception:
    737                         exc_info = sys.exc_info()

/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/client.py in _retry(self, futures)
   2128         response = await self.scheduler.retry(keys=keys, client=self.id)
   2129         for key in response:
-> 2130             st = self.futures[key]
   2131             st.retry()
   2132

KeyError: "('pca-fit-transform-f44a4381cb4779b9d45ba2c0ba7c2a72', 0, 1)"
```
Dask Distributed worker / scheduler logs
``` {'Scheduler': 'distributed.scheduler - INFO - Clear task state\n' 'distributed.scheduler - INFO - Scheduler at: ' 'tcp://127.0.0.1:41727\n' 'distributed.scheduler - INFO - dashboard at: ' '127.0.0.1:8787\n' 'distributed.scheduler - INFO - Register worker \n" 'distributed.scheduler - INFO - Starting worker compute stream, ' 'tcp://127.0.0.1:35216\n' 'distributed.scheduler - INFO - Register worker \n" 'distributed.scheduler - INFO - Starting worker compute stream, ' 'tcp://127.0.0.1:44716\n' 'distributed.scheduler - INFO - Receive client connection: ' 'Client-ee79b350-6fec-11ea-878c-0cc47a4279e3\n' 'distributed.scheduler - INFO - Remove worker \n" 'distributed.scheduler - INFO - Register worker \n" 'distributed.scheduler - INFO - Starting worker compute stream, ' 'tcp://127.0.0.1:38830\n' 'distributed.scheduler - INFO - Remove worker \n" 'distributed.scheduler - INFO - Register worker \n" 'distributed.scheduler - INFO - Starting worker compute stream, ' 'tcp://127.0.0.1:43574\n' 'distributed.scheduler - INFO - Remove worker \n" 'distributed.scheduler - INFO - Remove worker \n" 'distributed.scheduler - INFO - Lost all workers\n' 'distributed.scheduler - INFO - Client ' 'Client-ee79b350-6fec-11ea-878c-0cc47a4279e3 requests to retry 1 ' 'keys\n' 'distributed.scheduler - INFO - Register worker \n" 'distributed.scheduler - INFO - Starting worker compute stream, ' 'tcp://127.0.0.1:33853\n' 'distributed.scheduler - INFO - Register worker \n" 'distributed.scheduler - INFO - Starting worker compute stream, ' 'tcp://127.0.0.1:33057\n' 'distributed.scheduler - INFO - Remove worker \n" 'distributed.scheduler - INFO - Register worker \n" 'distributed.scheduler - INFO - Starting worker compute stream, ' 'tcp://127.0.0.1:33579\n' 'distributed.scheduler - INFO - Remove worker \n" 'distributed.scheduler - INFO - Register worker \n" 'distributed.scheduler - INFO - Starting worker compute stream, ' 'tcp://127.0.0.1:33304\n' 'distributed.scheduler - INFO - Remove worker \n" 'distributed.scheduler - INFO - Register worker \n" 'distributed.scheduler - INFO - Starting worker compute stream, ' 'tcp://127.0.0.1:40951', 'tcp://127.0.0.1:33304': 'distributed.worker - INFO - Start worker ' 'at: tcp://127.0.0.1:33304\n' 'distributed.worker - INFO - Listening ' 'to: tcp://127.0.0.1:33304\n' 'distributed.worker - INFO - Waiting to connect ' 'to: tcp://127.0.0.1:41727\n' 'distributed.worker - INFO - ' '-------------------------------------------------\n' 'distributed.worker - INFO - ' 'Threads: 2\n' 'distributed.worker - INFO - ' 'Memory: 6.50 GB\n' 'distributed.worker - INFO - Local Directory: ' '/project/cper_neon_aop/neon_temporal/dask-worker-space/worker-nswgxjil\n' 'distributed.worker - INFO - ' '-------------------------------------------------\n' 'distributed.worker - INFO - Registered ' 'to: tcp://127.0.0.1:41727\n' 'distributed.worker - INFO - ' '-------------------------------------------------', 'tcp://127.0.0.1:40951': 'distributed.worker - INFO - Start worker ' 'at: tcp://127.0.0.1:40951\n' 'distributed.worker - INFO - Listening ' 'to: tcp://127.0.0.1:40951\n' 'distributed.worker - INFO - Waiting to connect ' 'to: tcp://127.0.0.1:41727\n' 'distributed.worker - INFO - ' '-------------------------------------------------\n' 'distributed.worker - INFO - ' 'Threads: 2\n' 'distributed.worker - INFO - ' 'Memory: 6.50 GB\n' 'distributed.worker - INFO - Local Directory: ' '/project/cper_neon_aop/neon_temporal/dask-worker-space/worker-31u54x2o\n' 'distributed.worker - INFO - ' 
'-------------------------------------------------\n' 'distributed.worker - INFO - Registered ' 'to: tcp://127.0.0.1:41727\n' 'distributed.worker - INFO - ' '-------------------------------------------------'} ```
Results of gdb on core dump file:
```
Core was generated by `/opt/conda/envs/py_geo/bin/python -c from multiprocessing.forkserver import mai'.
Program terminated with signal 11, Segmentation fault.
```
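A possible complement to the gdb output above, sketched here as an assumption (it is not something used in this thread): the standard-library `faulthandler` module can print a Python-level traceback to a worker's stderr when it segfaults, assuming a connected `Client` named `client`.

```python
# Sketch: enable faulthandler on every worker so a segfault dumps a Python
# traceback to the worker's stderr before the process dies.
# Assumes `client` is an already-connected dask.distributed Client.
import faulthandler

def enable_faulthandler():
    faulthandler.enable()  # dump tracebacks on SIGSEGV / SIGFPE / SIGABRT / SIGBUS
    return True

client.run(enable_faulthandler)  # runs once in each worker process
```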
TomAugspurger commented 4 years ago

Thanks for the copy-pastable example. I'm not able to reproduce locally. Can you reproduce it in a fresh environment?

rmg55 commented 4 years ago

Hi Tom,

Thanks for the quick response! I am working within a singularity container (without write permissions) on an HPC. Here is a link to the image I am using - Dockerfile

Any suggestions on how I might be able to further debug (I struggle when trying to debug segmentation faults)? I will try to reproduce in a fresh environment, but thought I would pass along the image in case you would like to try it.

TomAugspurger commented 4 years ago

Thanks. I'm not sure why there would be a segfault, but it likely isn't from Dask. We're just coordinating calls to scikit-learn here.

You might watch the dashboard and see if anything strange happens before the worker dies (perhaps memory usage spikes suddenly and the job scheduler kills the worker? Some HPC systems don't let you easily spill to disk).
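A minimal sketch of one way to watch for that, assuming a local setup (the memory limit and dashboard port below are illustrative values, not from this thread):

```python
# Sketch: give each worker an explicit memory limit and a known dashboard
# port, so memory spikes are visible on the dashboard before the nanny or
# the batch scheduler kills the worker.
from dask.distributed import Client, LocalCluster

cluster = LocalCluster(
    n_workers=2,
    threads_per_worker=2,
    memory_limit="4GB",         # per-worker limit; shown on the dashboard
    dashboard_address=":8787",  # watch http://localhost:8787/status
)
client = Client(cluster)
```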

rmg55 commented 4 years ago

Hi @TomAugspurger,

I can confirm that I am able to run the example successfully in a local conda environment. However, I am still having issues running the example in the singularity image (DockerHub Image).

I get the same errors when I try:

singularity exec docker://rowangaffney/data_science_im_rs:latest /opt/conda/envs/py_geo/bin/python min_example.py

This is probably out of scope for dask-ml, but I thought I should post my update on the issue. If you have any further ideas/directions on how to debug, that would be great - otherwise, feel free to close and I can follow up with the Singularity project.

TomAugspurger commented 4 years ago

Thanks for the update.

I'm not especially sure where to go next for debugging... You might try with different schedulers:

  1. Use the threaded scheduler (by not creating a LocalCluster / Client)
  2. Use the single-threaded scheduler
```python
import dask

dask.config.set(scheduler="single-threaded")
```

If 1 passes, that tells us there's (maybe) some issue with communication / coordination between processes.

If 1 fails but 2 passes, that tells us there's an issue with using this scikit-learn code from multiple threads.
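A self-contained sketch of the comparison being suggested (the toy data and pipeline below are illustrative, not the reporter's exact example):

```python
# Sketch: run the same dask-ml grid search under the three schedulers.
import dask
from dask_ml.model_selection import GridSearchCV
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

X, y = make_classification(n_samples=200, n_features=50, random_state=0)
pipe = Pipeline([("reduce_dim", PCA()), ("classify", SVC())])
grid = {"reduce_dim__n_components": [2, 5], "classify__C": [1.0, 10.0]}
search = GridSearchCV(pipe, grid, cv=3)

# 1. Threaded scheduler: the default when no Client has been created.
search.fit(X, y)

# 2. Single-threaded (synchronous) scheduler: rules out threading issues.
with dask.config.set(scheduler="single-threaded"):
    search.fit(X, y)

# 3. Distributed scheduler: pass the client explicitly, as in the original
#    example above.
# from dask.distributed import Client, LocalCluster
# client = Client(LocalCluster(n_workers=2, threads_per_worker=2))
# GridSearchCV(pipe, grid, cv=3, scheduler=client).fit(X, y)
```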

rmg55 commented 4 years ago

Thanks @TomAugspurger

1 works, but 2 fails (see below). I guess that suggests that there is an issue with the communication / coordination between processes. Seems odd that the SelectKBest works, but the PCA does not...
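One hedged way to narrow that down (an illustrative sketch, not something tried in this thread) is to run a single PCA fit on the cluster directly, bypassing the grid search entirely:

```python
# Sketch: submit one scikit-learn PCA fit_transform straight to a worker.
# If this alone crashes the worker, the problem is independent of dask-ml's
# GridSearchCV and lies in running PCA (and its BLAS calls) on the cluster.
from dask.distributed import Client, LocalCluster
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA

client = Client(LocalCluster(n_workers=2, threads_per_worker=2))
X, _ = make_classification(n_samples=1200, n_features=271, random_state=1)

future = client.submit(PCA(n_components=10).fit_transform, X)
print(future.result().shape)  # expect (1200, 10) if the worker survives
```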

I am running this in an HPC environment (via SLURM and JupyterHub) within a singularity container. When launching the container, I am bind-mounting the following:

```
--bind /etc/munge --bind /var/log/munge --bind /var/run/munge --bind /usr/bin/gdb \
--bind /usr/bin/squeue --bind /usr/bin/scancel --bind /usr/bin/sbatch \
--bind /usr/bin/scontrol --bind /usr/bin/sinfo --bind /system/slurm:/etc/slurm \
--bind /run/munge --bind /usr/lib64 --bind /scinet01 --bind $HOME \
--bind /software/7/apps/envi -H $HOME:/home/jovyan
```
Threaded Scheduler:
```python
from dask.distributed import Client, LocalCluster
from dask_ml.model_selection import GridSearchCV as dask_GS
from sklearn.model_selection import GridSearchCV as sk_GS
from sklearn.datasets import make_multilabel_classification
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
import numpy as np
import dask

#dask.config.set(scheduler="single-threaded")
#cluster = LocalCluster(n_workers=5,threads_per_worker=2)
#client = Client(cluster)

X, Y = make_multilabel_classification(n_classes=4, n_labels=1,n_features=271,
                                      n_samples=1200,
                                      random_state=1)
Y = Y.sum(axis=1)

N_FEATURES_OPTIONS_pca = np.arange(10)[1::3].tolist()
N_FEATURES_OPTIONS_sel = np.arange(10)[1::3].tolist()
Cs = [1,10,100.]
gammas = [.001,0.01]

pca = PCA(iterated_power='auto')
selection = SelectKBest(f_classif)
svc = svm.SVC()

pipe1 = Pipeline([
    ('reduce_dim', selection),
    ('classify', svc)])

pipe2 = Pipeline([
    ('reduce_dim', pca),
    ('classify', svc)])

param_grid1 = [{'reduce_dim__k': N_FEATURES_OPTIONS_sel,
                'classify__C': Cs,
                'classify__gamma': gammas}]
param_grid2 = [{'reduce_dim__n_components': N_FEATURES_OPTIONS_pca,
                'classify__C': Cs,
                'classify__gamma': gammas}]

#Sklearn Gridsearch with PCA pipeline
sk_clf = sk_GS(pipe2, param_grid2,cv=3,scoring='f1_macro',refit=True)
sk_clf.fit(X,Y)
print(sk_clf.best_score_)

#Dask Gridsearch with SelectKbest
dask_clf1 = dask_GS(pipe1, param_grid1,cv=3,scoring='f1_macro',refit=True)
dask_clf1.fit(X,Y)
print(dask_clf1.best_score_)

#Dask Gridsearch with PCA
dask_clf2 = dask_GS(pipe2, param_grid2,cv=3,scoring='f1_macro',refit=True)
dask_clf2.fit(X,Y)
print(dask_clf2.best_score_)
```
```
0.3867629956246641
0.24710634994123742
0.3881990393401058
```

However, when I run it with the single-threaded scheduler, it fails:

Single Threaded Scheduler:
```python
from dask.distributed import Client, LocalCluster
from dask_ml.model_selection import GridSearchCV as dask_GS
from sklearn.model_selection import GridSearchCV as sk_GS
from sklearn.datasets import make_multilabel_classification
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
import numpy as np
import dask

dask.config.set(scheduler="single-threaded")
cluster = LocalCluster(n_workers=5,threads_per_worker=2)
client = Client(cluster)

X, Y = make_multilabel_classification(n_classes=4, n_labels=1,n_features=271,
                                      n_samples=1200,
                                      random_state=1)
Y = Y.sum(axis=1)

N_FEATURES_OPTIONS_pca = np.arange(10)[1::3].tolist()
N_FEATURES_OPTIONS_sel = np.arange(10)[1::3].tolist()
Cs = [1,10,100.]
gammas = [.001,0.01]

pca = PCA(iterated_power='auto')
selection = SelectKBest(f_classif)
svc = svm.SVC()

pipe1 = Pipeline([
    ('reduce_dim', selection),
    ('classify', svc)])

pipe2 = Pipeline([
    ('reduce_dim', pca),
    ('classify', svc)])

param_grid1 = [{'reduce_dim__k': N_FEATURES_OPTIONS_sel,
                'classify__C': Cs,
                'classify__gamma': gammas}]
param_grid2 = [{'reduce_dim__n_components': N_FEATURES_OPTIONS_pca,
                'classify__C': Cs,
                'classify__gamma': gammas}]

#Sklearn Gridsearch with PCA pipeline
sk_clf = sk_GS(pipe2, param_grid2,cv=3,scoring='f1_macro',refit=True)
sk_clf.fit(X,Y)
print(sk_clf.best_score_)

#Dask Gridsearch with SelectKbest
dask_clf1 = dask_GS(pipe1, param_grid1,scheduler=client,cv=3,scoring='f1_macro',refit=True)
dask_clf1.fit(X,Y)
print(dask_clf1.best_score_)

#Dask Gridsearch with PCA
dask_clf2 = dask_GS(pipe2, param_grid2,scheduler=client,cv=3,scoring='f1_macro',refit=True)
dask_clf2.fit(X,Y)
print(dask_clf2.best_score_)
```
```
0.387203154424183
0.24710634994123742
distributed.nanny - WARNING - Restarting worker
distributed.nanny - WARNING - Restarting worker
distributed.nanny - WARNING - Restarting worker
('score-507f5d8af2c70b3bd3c83f7c032fff21', 10, 1) has failed... retrying
distributed.nanny - WARNING - Restarting worker
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
in
     57 #Dask Gridsearch with PCA
     58 dask_clf2 = dask_GS(pipe2, param_grid2,scheduler=client,cv=3,scoring='f1_macro',refit=True)
---> 59 dask_clf2.fit(X,Y)
     60 print(dask_clf2.best_score_)

/opt/conda/envs/py_geo/lib/python3.7/site-packages/dask_ml/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
   1255             else:
   1256                 logger.warning("{} has failed... retrying".format(future.key))
-> 1257                 future.retry()
   1258                 ac.add(future)
   1259

/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/client.py in retry(self, **kwargs)
    305         Client.retry
    306         """
--> 307         return self.client.retry([self], **kwargs)
    308
    309     def cancelled(self):

/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/client.py in retry(self, futures, asynchronous)
   2140         futures: list of Futures
   2141         """
-> 2142         return self.sync(self._retry, futures, asynchronous=asynchronous)
   2143
   2144     @gen.coroutine

/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/client.py in sync(self, func, asynchronous, callback_timeout, *args, **kwargs)
    776         else:
    777             return sync(
--> 778                 self.loop, func, *args, callback_timeout=callback_timeout, **kwargs
    779             )
    780

/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/utils.py in sync(loop, func, callback_timeout, *args, **kwargs)
    346     if error[0]:
    347         typ, exc, tb = error[0]
--> 348         raise exc.with_traceback(tb)
    349     else:
    350         return result[0]

/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/utils.py in f()
    330             if callback_timeout is not None:
    331                 future = asyncio.wait_for(future, callback_timeout)
--> 332             result[0] = yield future
    333         except Exception as exc:
    334             error[0] = sys.exc_info()

/opt/conda/envs/py_geo/lib/python3.7/site-packages/tornado/gen.py in run(self)
    733
    734                     try:
--> 735                         value = future.result()
    736                     except Exception:
    737                         exc_info = sys.exc_info()

/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/client.py in _retry(self, futures)
   2129         response = await self.scheduler.retry(keys=keys, client=self.id)
   2130         for key in response:
-> 2131             st = self.futures[key]
   2132             st.retry()
   2133

KeyError: "('pca-transform-507f5d8af2c70b3bd3c83f7c032fff21', 1, 1)"
```
TomAugspurger commented 4 years ago

Shot in the dark: can you try disabling spill to disk? https://jobqueue.dask.org/en/latest/configuration-setup.html#no-local-storage

rmg55 commented 4 years ago

Hmm, I am still seeing the same issue when I avoid spilling to disk with:

```python
dask.config.set({'distributed.worker.memory.target': False, 'distributed.worker.memory.spill': False})
cluster = LocalCluster(n_workers=5, threads_per_worker=2)
client = Client(cluster)
```

`dask.config.config` results in
``` {'distributed.dashboard.link': '/user/{JUPYTERHUB_USER}/proxy/{port}/status', 'labextension': {'factory': {'module': 'dask_jobqueue', 'class': 'SLURMCluster', 'args': [], 'kwargs': {'project': 'your-project-id'}}}, 'logging': {'distributed': 'info', 'distributed.client': 'warning', 'bokeh': 'critical', 'tornado': 'critical', 'tornado.application': 'error'}, 'require-encryption': False, 'temporary-directory': None, 'dataframe': {'shuffle-compression': None}, 'array': {'svg': {'size': 120}, 'chunk-size': '128MiB', 'rechunk-threshold': 4}, 'distributed': {'version': 2, 'scheduler': {'allowed-failures': 3, 'bandwidth': 100000000, 'blocked-handlers': [], 'default-data-size': 1000, 'events-cleanup-delay': '1h', 'idle-timeout': None, 'transition-log-length': 100000, 'work-stealing': True, 'work-stealing-interval': '100ms', 'worker-ttl': None, 'pickle': True, 'preload': [], 'preload-argv': [], 'default-task-durations': {'rechunk-split': '1us', 'shuffle-split': '1us'}, 'validate': False, 'dashboard': {'status': {'task-stream-length': 1000}, 'tasks': {'task-stream-length': 100000}, 'tls': {'ca-file': None, 'key': None, 'cert': None}}}, 'worker': {'blocked-handlers': [], 'multiprocessing-method': 'forkserver', 'use-file-locking': True, 'connections': {'outgoing': 50, 'incoming': 10}, 'preload': [], 'preload-argv': [], 'daemon': True, 'validate': False, 'lifetime': {'duration': None, 'stagger': '0 seconds', 'restart': False}, 'profile': {'interval': 10, 'cycle': 1000, 'low-level': False}, 'memory': {'target': False, 'spill': False, 'pause': 0.8, 'terminate': 0.95}}, 'client': {'heartbeat': '5s'}, 'deploy': {'lost-worker-timeout': '15s'}, 'adaptive': {'interval': '1s', 'target-duration': '5s', 'minimum': 0, 'maximum': inf, 'wait-count': 3}, 'comm': {'retry': {'count': 0, 'delay': {'min': '1s', 'max': '20s'}}, 'compression': 'auto', 'offload': '10MiB', 'default-scheme': 'tcp', 'socket-backlog': 2048, 'recent-messages-log-length': 0, 'zstd': {'level': 3, 'threads': 0}, 'timeouts': {'connect': 3, 'tcp': 30}, 'require-encryption': False, 'tls': {'ciphers': None, 'ca-file': None, 'scheduler': {'cert': None, 'key': None}, 'worker': {'key': None, 'cert': None}, 'client': {'key': None, 'cert': None}}}, 'dashboard': {'link': '{scheme}://{host}:{port}/status', 'export-tool': False}, 'admin': {'tick': {'interval': 20, 'limit': 1000}, 'max-error-length': 10000, 'log-length': 10000, 'log-format': '%(name)s - %(levelname)s - %(message)s', 'pdb-on-err': False}}, 'rmm': {'pool-size': None}, 'ucx': {'tcp': None, 'nvlink': None, 'infiniband': None, 'cuda_copy': None, 'net-devices': None}, 'scheduler': 'dask.distributed', 'shuffle': 'tasks'} ```

I did see the following in one of the worker logs:

`cluster.logs()` results in
``` distributed.worker - INFO - Start worker at: tcp://127.0.0.1:46552 distributed.worker - INFO - Listening to: tcp://127.0.0.1:46552 distributed.worker - INFO - Waiting to connect to: tcp://127.0.0.1:39779 distributed.worker - INFO - ------------------------------------------------- distributed.worker - INFO - Threads: 2 distributed.worker - INFO - Memory: 13.00 GB distributed.worker - INFO - Local Directory: /project/cper_neon_aop/neon_temporal/dask-worker-space/worker-_fo15gy1 distributed.worker - INFO - ------------------------------------------------- distributed.worker - INFO - Registered to: tcp://127.0.0.1:39779 distributed.worker - INFO - ------------------------------------------------- distributed.worker - ERROR - Worker stream died during communication: tcp://127.0.0.1:41763 Traceback (most recent call last): File "/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/comm/tcp.py", line 188, in read n_frames = await stream.read_bytes(8) tornado.iostream.StreamClosedError: Stream is closed During handling of the above exception, another exception occurred: Traceback (most recent call last): File "/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/worker.py", line 1953, in gather_dep self.rpc, deps, worker, who=self.address File "/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/worker.py", line 3222, in get_data_from_worker return await retry_operation(_get_data, operation="get_data_from_worker") File "/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/utils_comm.py", line 391, in retry_operation operation=operation, File "/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/utils_comm.py", line 379, in retry return await coro() File "/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/worker.py", line 3209, in _get_data max_connections=max_connections, File "/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/core.py", line 541, in send_recv response = await comm.read(deserializers=deserializers) File "/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/comm/tcp.py", line 208, in read convert_stream_closed_error(self, e) File "/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/comm/tcp.py", line 121, in convert_stream_closed_error raise CommClosedError("in %s: %s: %s" % (obj, exc.__class__.__name__, exc)) distributed.comm.core.CommClosedError: in : ConnectionResetError: [Errno 104] Connection reset by peer distributed.worker - ERROR - Worker stream died during communication: tcp://127.0.0.1:44873 Traceback (most recent call last): File "/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/comm/tcp.py", line 188, in read n_frames = await stream.read_bytes(8) tornado.iostream.StreamClosedError: Stream is closed During handling of the above exception, another exception occurred: Traceback (most recent call last): File "/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/worker.py", line 1953, in gather_dep self.rpc, deps, worker, who=self.address File "/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/worker.py", line 3222, in get_data_from_worker return await retry_operation(_get_data, operation="get_data_from_worker") File "/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/utils_comm.py", line 391, in retry_operation operation=operation, File "/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/utils_comm.py", line 379, in retry return await coro() File 
"/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/worker.py", line 3209, in _get_data max_connections=max_connections, File "/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/core.py", line 541, in send_recv response = await comm.read(deserializers=deserializers) File "/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/comm/tcp.py", line 208, in read convert_stream_closed_error(self, e) File "/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/comm/tcp.py", line 121, in convert_stream_closed_error raise CommClosedError("in %s: %s: %s" % (obj, exc.__class__.__name__, exc)) distributed.comm.core.CommClosedError: in : ConnectionResetError: [Errno 104] Connection reset by peer distributed.worker - ERROR - Worker stream died during communication: tcp://127.0.0.1:39042 Traceback (most recent call last): File "/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/comm/tcp.py", line 188, in read n_frames = await stream.read_bytes(8) tornado.iostream.StreamClosedError: Stream is closed During handling of the above exception, another exception occurred: Traceback (most recent call last): File "/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/worker.py", line 1953, in gather_dep self.rpc, deps, worker, who=self.address File "/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/worker.py", line 3222, in get_data_from_worker return await retry_operation(_get_data, operation="get_data_from_worker") File "/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/utils_comm.py", line 391, in retry_operation operation=operation, File "/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/utils_comm.py", line 379, in retry return await coro() File "/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/worker.py", line 3209, in _get_data max_connections=max_connections, File "/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/core.py", line 541, in send_recv response = await comm.read(deserializers=deserializers) File "/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/comm/tcp.py", line 208, in read convert_stream_closed_error(self, e) File "/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/comm/tcp.py", line 121, in convert_stream_closed_error raise CommClosedError("in %s: %s: %s" % (obj, exc.__class__.__name__, exc)) distributed.comm.core.CommClosedError: in : ConnectionResetError: [Errno 104] Connection reset by peer distributed.worker - INFO - Can't find dependencies for key ('pca-fit-transform-507f5d8af2c70b3bd3c83f7c032fff21', 1, 0) distributed.worker - INFO - Dependent not found: pca-507f5d8af2c70b3bd3c83f7c032fff21 0 . 
Asking scheduler distributed.worker - INFO - Can't find dependencies for key ('pca-fit-transform-507f5d8af2c70b3bd3c83f7c032fff21', 0, 0) distributed.worker - INFO - Can't find dependencies for key ('pca-fit-transform-507f5d8af2c70b3bd3c83f7c032fff21', 1, 0) distributed.worker - INFO - Can't find dependencies for key ('pca-fit-transform-507f5d8af2c70b3bd3c83f7c032fff21', 0, 0) distributed.worker - INFO - Can't find dependencies for key ('pca-fit-transform-507f5d8af2c70b3bd3c83f7c032fff21', 1, 0) distributed.worker - INFO - Can't find dependencies for key ('pca-fit-transform-507f5d8af2c70b3bd3c83f7c032fff21', 0, 0) distributed.worker - INFO - Can't find dependencies for key ('pca-fit-transform-507f5d8af2c70b3bd3c83f7c032fff21', 1, 0) distributed.worker - INFO - Can't find dependencies for key ('pca-fit-transform-507f5d8af2c70b3bd3c83f7c032fff21', 0, 0) distributed.worker - INFO - Can't find dependencies for key ('pca-fit-transform-507f5d8af2c70b3bd3c83f7c032fff21', 1, 0) distributed.worker - INFO - Can't find dependencies for key ('pca-fit-transform-507f5d8af2c70b3bd3c83f7c032fff21', 0, 0) distributed.worker - ERROR - Worker stream died during communication: tcp://127.0.0.1:45289 Traceback (most recent call last): File "/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/comm/core.py", line 232, in connect _raise(error) File "/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/comm/core.py", line 213, in _raise raise IOError(msg) OSError: Timed out trying to connect to 'tcp://127.0.0.1:45289' after 3 s: in : ConnectionRefusedError: [Errno 111] Connection refused During handling of the above exception, another exception occurred: Traceback (most recent call last): File "/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/worker.py", line 1953, in gather_dep self.rpc, deps, worker, who=self.address File "/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/worker.py", line 3222, in get_data_from_worker return await retry_operation(_get_data, operation="get_data_from_worker") File "/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/utils_comm.py", line 391, in retry_operation operation=operation, File "/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/utils_comm.py", line 379, in retry return await coro() File "/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/worker.py", line 3199, in _get_data comm = await rpc.connect(worker) File "/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/core.py", line 908, in connect connection_args=self.connection_args, File "/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/comm/core.py", line 243, in connect _raise(error) File "/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/comm/core.py", line 213, in _raise raise IOError(msg) OSError: Timed out trying to connect to 'tcp://127.0.0.1:45289' after 3 s: Timed out trying to connect to 'tcp://127.0.0.1:45289' after 3 s: in : ConnectionRefusedError: [Errno 111] Connection refused ```
TomAugspurger commented 4 years ago

Hmm I'm not sure what to try next :/

rmg55 commented 4 years ago

OK, thanks for your help @TomAugspurger. I'll close this, as I think it may be an issue with the singularity container or related to the HPC system.

dtrudg commented 4 years ago

Noting this over here from https://github.com/sylabs/singularity/issues/5259 so there's a pointer in case others come across it here.

It looks like you are binding the entire /usr/lib64 host library directory into the container.

`--bind /usr/lib64`

This will almost certainly cause issues including segfaults, unless the container OS exactly matches the host - because the executables in the container expect to use libraries from the container... not the ones from the host which will be a different version / built differently.

Also - when you run Singularity containers with Python apps, Python packages installed in your `$HOME` with `pip install --user` can interfere. Try `--contain` to avoid that.
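A small sketch of how either problem could be checked from inside the container (illustrative only; the library names grepped for below are assumptions):

```python
# Sketch: two quick in-container checks for the problems described above --
# host BLAS/LAPACK libraries leaking in via bind mounts, and user
# site-packages from $HOME leaking in via `pip install --user`.
import site
import sys

# 1. Is anything from the user's home directory on sys.path?
print("user site enabled:", site.ENABLE_USER_SITE)
print([p for p in sys.path if p.startswith(site.getuserbase())])

# 2. Which BLAS-like shared objects has this process actually mapped?
import numpy as np  # importing numpy loads its BLAS

with open("/proc/self/maps") as maps:  # Linux-only
    libs = {
        line.split()[-1]
        for line in maps
        if len(line.split()) > 5
        and any(name in line for name in ("blas", "lapack", "mkl"))
    }
print("\n".join(sorted(libs)))
```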

rmg55 commented 4 years ago

I was able to do some more debugging that might help with diagnosing the issue. Using gdb to examine the segfaults (via the core dump files), I can get the following backtraces. @TomAugspurger, @dctrud, and @ynanyam, any idea if this is an issue within Singularity (i.e. shared libraries mixed between host and container) or a Python library issue? Thanks!

With `gdb /opt/conda/envs/py_geo/bin/python core.XXXX` I get:
```
(py_geo) Singularity> gdb /opt/conda/envs/py_geo/bin/python core.2717
GNU gdb (GDB) 8.3.1
Copyright (C) 2019 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.
Type "show copying" and "show warranty" for details.
This GDB was configured as "x86_64-pc-linux-gnu".
Type "show configuration" for configuration details.
For bug reporting instructions, please see: .
Find the GDB manual and other documentation resources online at: .
For help, type "help".
Type "apropos word" to search for commands related to "word"...
Reading symbols from /opt/conda/envs/py_geo/bin/python...
warning: core file may not match specified executable file.
[New LWP 2772]
[New LWP 2773]
[New LWP 2807]
[New LWP 2808]
[New LWP 2727]
[New LWP 2810]
[New LWP 2811]
[New LWP 2726]
[New LWP 2812]
[New LWP 2725]
[New LWP 2809]
[New LWP 2717]
[New LWP 2720]
[New LWP 2814]
[New LWP 2813]
[New LWP 2815]
[New LWP 2771]
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".
Core was generated by `/opt/conda/envs/py_geo/bin/python -c from multiprocessing.forkserver import mai'.
Program terminated with signal SIGSEGV, Segmentation fault.
#0  0x00002ac26d81b58c in gemm_driver () from /opt/conda/envs/py_geo/lib/python3.7/site-packages/numpy/core/../../../../libcblas.so.3
[Current thread is 1 (Thread 0x2ac278f9d700 (LWP 2772))]
```
If I do a backtrace in gdb (`bt`) I get:
``` (gdb) bt #0 0x00002ac26d81b58c in gemm_driver () from /opt/conda/envs/py_geo/lib/python3.7/site-packages/numpy/core/../../../../libcblas.so.3 #1 0x00002ac26d81b6da in dgemm_thread_nn () from /opt/conda/envs/py_geo/lib/python3.7/site-packages/numpy/core/../../../../libcblas.so.3 #2 0x00002ac26d727e2c in cblas_dgemm () from /opt/conda/envs/py_geo/lib/python3.7/site-packages/numpy/core/../../../../libcblas.so.3 #3 0x00002ac26d4cf03a in DOUBLE_matmul () from /opt/conda/envs/py_geo/lib/python3.7/site-packages/numpy/core/_multiarray_umath.cpython-37m-x86_64-linux-gnu.so #4 0x00002ac26d4dcf2a in PyUFunc_GenericFunction () from /opt/conda/envs/py_geo/lib/python3.7/site-packages/numpy/core/_multiarray_umath.cpython-37m-x86_64-linux-gnu.so #5 0x00002ac26d4dd1b1 in ufunc_generic_call () from /opt/conda/envs/py_geo/lib/python3.7/site-packages/numpy/core/_multiarray_umath.cpython-37m-x86_64-linux-gnu.so #6 0x000055dcccec0e3f in _PyObject_FastCallDict (callable=, args=, nargs=, kwargs=0x0) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Objects/call.c:125 #7 0x000055dcccedf0fe in object_vacall (callable=, vargs=) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Objects/call.c:1202 #8 0x000055dcccedf249 in PyObject_CallFunctionObjArgs (callable=) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Objects/call.c:1267 #9 0x000055dccce093b3 in binary_op1 (v=, w=, op_slot=) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Objects/abstract.c:807 #10 0x000055dccce1dc3c in binary_op (op_name=0x55dcccfe445c "@", op_slot=272, w=, v=) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Objects/abstract.c:836 #11 PyNumber_MatrixMultiply (v=, w=) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Objects/abstract.c:1007 #12 0x000055dcccf7e97c in _PyEval_EvalFrameDefault (f=, throwflag=) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Python/ceval.c:1219 #13 0x000055dcccebf8f9 in _PyEval_EvalCodeWithName (_co=, globals=, locals=, args=, argcount=, kwnames=0x0, kwargs=0x2ac27b984e20, kwcount=, kwstep=1, defs=0x2ac27b09a6e8, defcount=1, kwdefs=0x0, closure=0x0, name='safe_sparse_dot', qualname='safe_sparse_dot') at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Python/ceval.c:3930 #14 0x000055dcccf0e985 in _PyFunction_FastCallKeywords (func=, stack=0x2ac27b984e10, nargs=2, kwnames=) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Objects/call.c:433 #15 0x000055dcccf77216 in call_function (kwnames=0x0, oparg=, pp_stack=) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Python/ceval.c:4616 #16 _PyEval_EvalFrameDefault (f=, throwflag=) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Python/ceval.c:3124 #17 0x000055dcccebf8f9 in _PyEval_EvalCodeWithName (_co=, globals=, locals=, args=, argcount=, kwnames=0x0, kwargs=0x2ac290353960, kwcount=, kwstep=1, defs=0x2ac27b09e478, defcount=2, kwdefs=0x0, closure=0x0, name='randomized_range_finder', qualname='randomized_range_finder') at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Python/ceval.c:3930 #18 0x000055dcccf0e985 in _PyFunction_FastCallKeywords (func=, stack=0x2ac290353938, nargs=5, kwnames=) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Objects/call.c:433 #19 0x000055dcccf77216 in call_function (kwnames=0x0, oparg=, pp_stack=) at 
/home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Python/ceval.c:4616 #20 _PyEval_EvalFrameDefault (f=, throwflag=) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Python/ceval.c:3124 #21 0x000055dcccebf8f9 in _PyEval_EvalCodeWithName (_co=, globals=, locals=, args=, argcount=, kwnames=0x2ac27b6c5cc8, kwargs=0x55dcd02dc748, kwcount=, kwstep=1, defs=0x2ac27b09d458, defcount=6, kwdefs=0x0, closure=0x0, name='randomized_svd', qualname='randomized_svd') at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Python/ceval.c:3930 #22 0x000055dcccf0e9e7 in _PyFunction_FastCallKeywords (func=, stack=0x55dcd02dc740, nargs=1, kwnames=) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Objects/call.c:433 #23 0x000055dcccf782e7 in call_function (kwnames=('n_components', 'n_iter', 'flip_sign', 'random_state'), oparg=, pp_stack=) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Python/ceval.c:4616 #24 _PyEval_EvalFrameDefault (f=, throwflag=) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Python/ceval.c:3139 #25 0x000055dcccf0e75b in function_code_fastcall (globals=, nargs=4, args=, co=) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Objects/call.c:283 #26 _PyFunction_FastCallKeywords (func=, stack=0x2ac2900015b8, nargs=4, kwnames=) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Objects/call.c:408 #27 0x000055dcccf774a0 in call_function (kwnames=0x0, oparg=, pp_stack=) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Python/ceval.c:4616 #28 _PyEval_EvalFrameDefault (f=, throwflag=) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Python/ceval.c:3110 #29 0x000055dcccf0e75b in function_code_fastcall (globals=, nargs=2, args=, co=) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Objects/call.c:283 #30 _PyFunction_FastCallKeywords (func=, stack=0x55dcd02952e0, nargs=2, kwnames=) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Objects/call.c:408 #31 0x000055dcccf774a0 in call_function (kwnames=0x0, oparg=, pp_stack=) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Python/ceval.c:4616 #32 _PyEval_EvalFrameDefault (f=, throwflag=) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Python/ceval.c:3110 #33 0x000055dcccebf8f9 in _PyEval_EvalCodeWithName (_co=, globals=, locals=, args=, argcount=, kwnames=0x0, kwargs=0x0, kwcount=, kwstep=2, defs=0x2ac27b6c7968, defcount=1, kwdefs=0x0, closure=0x0, name='fit_transform', qualname='PCA.fit_transform') at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Python/ceval.c:3930 #34 0x000055dcccec0a35 in _PyFunction_FastCallDict (func=, args=0x2ac278f9bf50, nargs=3, kwargs=) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Objects/call.c:376 --Type for more, q to quit, c to continue without paging-- #35 0x000055dcccedee03 in _PyObject_Call_Prepend (callable=, obj=, args=(, ), kwargs={}) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Objects/call.c:908 #36 0x000055dccced175e in PyObject_Call (callable=, args=, kwargs=) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Objects/call.c:245 #37 0x000055dcccf78d6a in do_call_core (kwdict={}, callargs=(, ), func=) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Python/ceval.c:4645 #38 
_PyEval_EvalFrameDefault (f=, throwflag=) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Python/ceval.c:3191 #39 0x000055dcccebf8f9 in _PyEval_EvalCodeWithName (_co=, globals=, locals=, args=, argcount=, kwnames=0x0, kwargs=0x0, kwcount=, kwstep=2, defs=0x2ac29d66d0c8, defcount=4, kwdefs=0x0, closure=0x0, name='fit_transform', qualname='fit_transform') at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Python/ceval.c:3930 #40 0x000055dcccec0a35 in _PyFunction_FastCallDict (func=, args=0x2ac27b726cc8, nargs=7, kwargs=) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Objects/call.c:376 #41 0x000055dcccf78d6a in do_call_core (kwdict=0x0, callargs=(, iterated_power='auto', random_state=None) at remote 0x2ac278b6b250>, , , 'raise', ['n_components'], (90,), None), func=) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Python/ceval.c:4645 #42 _PyEval_EvalFrameDefault (f=, throwflag=) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Python/ceval.c:3191 #43 0x000055dcccec096b in function_code_fastcall (globals=, nargs=1, args=, co=0x2ac2780a19c0) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Objects/call.c:283 #44 _PyFunction_FastCallDict (func=, args=0x2ac278b5d728, nargs=1, kwargs=) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Objects/call.c:322 #45 0x000055dcccf78d6a in do_call_core (kwdict={}, callargs=((, , iterated_power='auto', random_state=None) at remote 0x2ac278b6b250>, (, , ), (, ), (, )], pairwise=False, cache={(0, True, True): , (0, False, True): , (1, True, True): , (1, False, True): }, num_train_samples=1200) at remote 0x2ac27b73e0d0>, , , True, True, 1), (, <...>, ) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Python/ceval.c:4645 #46 _PyEval_EvalFrameDefault (f=, throwflag=) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Python/ceval.c:3191 #47 0x000055dcccec096b in function_code_fastcall (globals=, nargs=8, args=, co=0x2ac2780a1d20) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Objects/call.c:283 #48 _PyFunction_FastCallDict (func=, args=0x2ac29d6814e8, nargs=8, kwargs=) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Objects/call.c:322 #49 0x000055dcccf78d6a in do_call_core (kwdict={}, callargs=(, ((, , iterated_power='auto', random_state=None) at remote 0x2ac278b6b250>, (, , ), (, ), (, )], pairwise=False, cache={(0, True, True): , (0, False, True): , (1, True, True): , (1, False, True): }, num_train_samples=1200) at remote 0x2ac27b73e0d0>, , , True, True, 1), (, <...>, ...(truncated), func=) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Python/ceval.c:4645 #50 _PyEval_EvalFrameDefault (f=, throwflag=) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Python/ceval.c:3191 #51 0x000055dcccf0e75b in function_code_fastcall (globals=, nargs=1, args=, co=) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Objects/call.c:283 #52 _PyFunction_FastCallKeywords (func=, stack=0x2ac27b72d5e8, nargs=1, kwnames=) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Objects/call.c:408 #53 0x000055dcccf774a0 in call_function (kwnames=0x0, oparg=, pp_stack=) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Python/ceval.c:4616 #54 _PyEval_EvalFrameDefault (f=, throwflag=) at 
/home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Python/ceval.c:3110 #55 0x000055dcccec096b in function_code_fastcall (globals=, nargs=2, args=, co=0x2ac277d266f0) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Objects/call.c:283 #56 _PyFunction_FastCallDict (func=, args=0x2ac278940f18, nargs=2, kwargs=) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Objects/call.c:322 #57 0x000055dcccf78d6a in do_call_core (kwdict={}, callargs=(, mutex=<_thread.lock at remote 0x2ac2781f6fc0>, not_empty=, acquire=, release=, _waiters=) at remote 0x2ac278461f90>, not_full=, acquire=, release=, _waiters=) at remote 0x2ac278461410>, all_tasks_done=, acquire=, release=) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Python/ceval.c:4645 #58 _PyEval_EvalFrameDefault (f=, throwflag=) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Python/ceval.c:3191 --Type for more, q to quit, c to continue without paging-- #59 0x000055dcccf0e75b in function_code_fastcall (globals=, nargs=1, args=, co=) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Objects/call.c:283 #60 _PyFunction_FastCallKeywords (func=, stack=0x2ac290000cd0, nargs=1, kwnames=) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Objects/call.c:408 #61 0x000055dcccf774a0 in call_function (kwnames=0x0, oparg=, pp_stack=) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Python/ceval.c:4616 #62 _PyEval_EvalFrameDefault (f=, throwflag=) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Python/ceval.c:3110 #63 0x000055dcccf0e75b in function_code_fastcall (globals=, nargs=1, args=, co=) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Objects/call.c:283 #64 _PyFunction_FastCallKeywords (func=, stack=0x2ac27b5c9ad8, nargs=1, kwnames=) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Objects/call.c:408 #65 0x000055dcccf774a0 in call_function (kwnames=0x0, oparg=, pp_stack=) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Python/ceval.c:4616 #66 _PyEval_EvalFrameDefault (f=, throwflag=) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Python/ceval.c:3110 #67 0x000055dcccec096b in function_code_fastcall (globals=, nargs=1, args=, co=0x2ac26c17a390) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Objects/call.c:283 #68 _PyFunction_FastCallDict (func=, args=0x2ac278f9ce00, nargs=1, kwargs=) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Objects/call.c:322 #69 0x000055dcccedee03 in _PyObject_Call_Prepend (callable=, obj=, args=(), kwargs=0x0) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Objects/call.c:908 #70 0x000055dccced175e in PyObject_Call (callable=, args=, kwargs=) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Objects/call.c:245 #71 0x000055dcccfcf6a7 in t_bootstrap (boot_raw=0x2ac27894f2a0) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Modules/_threadmodule.c:994 #72 0x000055dcccf8a418 in pythread_wrapper (arg=) at /home/conda/feedstock_root/build_artifacts/python_1585001848288/work/Python/thread_pthread.h:174 #73 0x00002ac26add36db in start_thread () from /lib/x86_64-linux-gnu/libpthread.so.0 #74 0x00002ac26b10c88f in clone () from /lib/x86_64-linux-gnu/libc.so.6 ```
If I do a Python-level backtrace in gdb (`py-bt`) I get:
```
(gdb) py-bt
Traceback (most recent call first):
  File "/opt/conda/envs/py_geo/lib/python3.7/site-packages/sklearn/utils/extmath.py", line 151, in safe_sparse_dot
    ret = a @ b
  File "/opt/conda/envs/py_geo/lib/python3.7/site-packages/sklearn/utils/extmath.py", line 231, in randomized_range_finder
    Q, _ = linalg.lu(safe_sparse_dot(A, Q), permute_l=True)
  File "/opt/conda/envs/py_geo/lib/python3.7/site-packages/sklearn/utils/extmath.py", line 346, in randomized_svd
    power_iteration_normalizer, random_state)
  (frame information optimized out)
  (frame information optimized out)
  File "/opt/conda/envs/py_geo/lib/python3.7/site-packages/sklearn/decomposition/_pca.py", line 376, in fit_transform
    U, S, V = self._fit(X)
  File "/opt/conda/envs/py_geo/lib/python3.7/site-packages/dask_ml/model_selection/methods.py", line 260, in fit_transform
    Xt = est.fit_transform(X, y, **fit_params)
  File "/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/worker.py", line 3301, in execute_task
    return func(*map(execute_task, args))
  File "/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/worker.py", line 3408, in apply_function
    result = function(*args, **kwargs)
  (frame information optimized out)
  File "/opt/conda/envs/py_geo/lib/python3.7/site-packages/distributed/threadpoolexecutor.py", line 55, in _worker
    task.run()
  (frame information optimized out)
  (frame information optimized out)
  File "/opt/conda/envs/py_geo/lib/python3.7/threading.py", line 890, in _bootstrap
    self._bootstrap_inner()
```
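The frames above die inside a threaded BLAS `gemm` call. A hedged follow-up sketch, assuming the `threadpoolctl` package is available (it may need to be installed separately in this environment), to see which BLAS is actually loaded and whether the same matmul survives with BLAS limited to one thread:

```python
# Sketch: (a) report which BLAS shared library numpy/scipy actually loaded,
# and (b) retry a matmul of the same shape with BLAS forced to one thread,
# since the segfault above happens inside the threaded gemm_driver.
import numpy as np
from threadpoolctl import threadpool_info, threadpool_limits

for pool in threadpool_info():  # (a) loaded BLAS / OpenMP libraries
    print(pool.get("internal_api"), pool.get("filepath"), pool.get("version"))

with threadpool_limits(limits=1):  # (b) single-threaded BLAS
    a = np.random.rand(1200, 271)
    print((a @ a.T).shape)
```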
TomAugspurger commented 4 years ago

Thanks for the additional debugging, but it unfortunately doesn't give me any new guesses :/