coiled / feedback

A place to provide Coiled feedback
14 stars 3 forks source link

Timed out trying to connect to websocket cluster #150

Closed mrocklin closed 1 year ago

mrocklin commented 3 years ago

I suspect that websocket support isn't fully released yet, which may explain the issue, but the following fails in prod:

```python
import coiled
cluster = coiled.Cluster(protocol="wss")
```

OSError                                   Traceback (most recent call last)
<ipython-input-4-4414c0a82aa2> in <module>
----> 1 cluster = coiled.Cluster(protocol="wss", name="mrocklin-4c652147-f")

~/miniconda/lib/python3.8/site-packages/coiled/cluster.py in __init__(self, name, n_workers, configuration, software, worker_cpu, worker_gpu, worker_memory, worker_class, worker_options, scheduler_cpu, scheduler_memory, scheduler_class, scheduler_options, asynchronous, cloud, account, shutdown_on_close, backend_options, credentials, timeout, environ, protocol)
    351 
    352         if not self.asynchronous:
--> 353             self.sync(self._start)
    354 
    355     @property

~/miniconda/lib/python3.8/site-packages/coiled/cluster.py in sync(self, func, asynchronous, callback_timeout, *args, **kwargs)
    387         **kwargs,
    388     ) -> Union[_T, Awaitable[_T]]:
--> 389         return super().sync(
    390             func,
    391             *args,

~/workspace/distributed/distributed/deploy/cluster.py in sync(self, func, asynchronous, callback_timeout, *args, **kwargs)
    183             return future
    184         else:
--> 185             return sync(self.loop, func, *args, **kwargs)
    186 
    187     def _log(self, log):

~/workspace/distributed/distributed/utils.py in sync(loop, func, callback_timeout, *args, **kwargs)
    324     if error[0]:
    325         typ, exc, tb = error[0]
--> 326         raise exc.with_traceback(tb)
    327     else:
    328         return result[0]

~/workspace/distributed/distributed/utils.py in f()
    307             if callback_timeout is not None:
    308                 future = asyncio.wait_for(future, callback_timeout)
--> 309             result[0] = yield future
    310         except Exception:
    311             error[0] = sys.exc_info()

~/workspace/tornado/tornado/gen.py in run(self)
    764 
    765                     try:
--> 766                         value = future.result()
    767                     except Exception:
    768                         exc_info = sys.exc_info()

~/miniconda/lib/python3.8/site-packages/coiled/context.py in wrapper(*args, **kwargs)
     36     async def wrapper(*args, **kwargs):
     37         with operation_context(f"{func.__module__}.{func.__name__}"):
---> 38             return await func(*args, **kwargs)
     39 
     40     return wrapper

~/miniconda/lib/python3.8/site-packages/coiled/cluster.py in _start(self)
    487                 raise
    488 
--> 489             await super()._start()
    490 
    491             # TODO: Come up with a better long-term solution. Below we raise an informative error message

~/workspace/distributed/distributed/deploy/cluster.py in _start(self)
     64 
     65     async def _start(self):
---> 66         comm = await self.scheduler_comm.live_comm()
     67         await comm.write({"op": "subscribe_worker_status"})
     68         self.scheduler_info = SchedulerInfo(await comm.read())

~/workspace/distributed/distributed/core.py in live_comm(self)
    751             del self.comms[s]
    752         if not open or comm.closed():
--> 753             comm = await connect(
    754                 self.address,
    755                 self.timeout,

~/workspace/distributed/distributed/comm/core.py in connect(addr, timeout, deserialize, handshake_overrides, **connection_args)
    308             await asyncio.sleep(backoff)
    309     else:
--> 310         raise OSError(
    311             f"Timed out trying to connect to {addr} after {timeout} s"
    312         ) from active_exception

OSError: Timed out trying to connect to wss://cloud.coiled.io/cluster/26550/ after 5 s

mrocklin commented 3 years ago

This also fails in staging. The cluster does seem to come up, though.

ian-r-rose commented 3 years ago

I can also reproduce this. A week or so ago this worked for me, so I suspect a regression.

shughes-uk commented 1 year ago

Support for the `protocol` option has been deprecated.