coiled / feedback

A place to provide Coiled feedback
14 stars 3 forks source link

502 error when starting cluster on GCP #114

Closed mrocklin closed 3 years ago

mrocklin commented 3 years ago

This is on sandbox. I'm trying to make a cluster on GCP, and this particular time it failed (although it has worked before).

In [8]: time cluster = coiled.Cluster(software="play", n_workers=5)
Creating Cluster. This takes about a minute ...Checking environment images
Valid environment image found
    ---------------------------------------------------------------------------
Exception                                 Traceback (most recent call last)
<timed exec> in <module>

~/miniconda/lib/python3.8/site-packages/coiled/cluster.py in __init__(self, n_workers, configuration, software, worker_cpu, worker_gpu, worker_memory, worker_class, worker_options, scheduler_cpu, scheduler_memory, scheduler_class, scheduler_options, name, asynchronous, cloud, account, shutdown_on_close, backend_options, credentials)
    152         self._name = "coiled.Cluster"  # Used in Dask's Cluster._ipython_display_
    153         if not self.asynchronous:
--> 154             self.sync(self._start)
    155 
    156     @property

~/workspace/distributed/distributed/deploy/cluster.py in sync(self, func, asynchronous, callback_timeout, *args, **kwargs)
    187             return future
    188         else:
--> 189             return sync(self.loop, func, *args, **kwargs)
    190 
    191     def _log(self, log):

~/workspace/distributed/distributed/utils.py in sync(loop, func, callback_timeout, *args, **kwargs)
    338     if error[0]:
    339         typ, exc, tb = error[0]
--> 340         raise exc.with_traceback(tb)
    341     else:
    342         return result[0]

~/workspace/distributed/distributed/utils.py in f()
    322             if callback_timeout is not None:
    323                 future = asyncio.wait_for(future, callback_timeout)
--> 324             result[0] = yield future
    325         except Exception as exc:
    326             error[0] = sys.exc_info()

~/workspace/tornado/tornado/gen.py in run(self)
    764 
    765                     try:
--> 766                         value = future.result()
    767                     except Exception:
    768                         exc_info = sys.exc_info()

~/miniconda/lib/python3.8/site-packages/coiled/cluster.py in _start(self)
    215             )
    216             if self._start_n_workers:
--> 217                 await self._scale(self._start_n_workers)
    218 
    219         self.security, info = await self.cloud.security(

~/miniconda/lib/python3.8/site-packages/coiled/cluster.py in _scale(self, n)
    334 
    335     async def _scale(self, n: int) -> Tuple[List[Dict], List[Dict]]:
--> 336         return await self.cloud.scale(
    337             account=self.account,
    338             cluster_id=self.cluster_id,  # type: ignore

~/miniconda/lib/python3.8/site-packages/coiled/core.py in _scale(self, cluster_id, n, account)
    509         )
    510         if response.status >= 400:
--> 511             await handle_api_exception(response)
    512 
    513     def scale(self, cluster_id: int, n: int, account: str = None) -> None:

~/miniconda/lib/python3.8/site-packages/coiled/utils.py in handle_api_exception(response, exception_cls)
     73         # Response contains no text/body, let's not raise an empty exception
     74         error_text = f"{response.status} - {response.reason}"
---> 75     raise Exception(error_text)
     76 
     77 

Exception: <html>
<head><title>502 Bad Gateway</title></head>
<body>
<center><h1>502 Bad Gateway</h1></center>
</body>
</html>