coiled / feedback

A place to provide Coiled feedback
14 stars 3 forks source link

502 Bad Gateway #99

Closed mrocklin closed 1 year ago

mrocklin commented 3 years ago
In [26]: cluster = coiled.Cluster(n_workers=100, software="dev")
Creating Cluster. This takes about a minute ...Checking environment images
Valid environment image found
---------------------------------------------------------------------------
Exception                                 Traceback (most recent call last)
<ipython-input-26-a48b1722f86c> in <module>
----> 1 cluster = coiled.Cluster(n_workers=100, software="dev")

~/miniconda/lib/python3.8/site-packages/coiled/cluster.py in __init__(self, n_workers, configuration, software, worker_cpu, worker_gpu, worker_memory, worker_class, worker_options, scheduler_cpu, scheduler_memory, scheduler_class, scheduler_options, name, asynchronous, cloud, account, shutdown_on_close, backend_options, credentials)
    151         self._name = "coiled.Cluster"  # Used in Dask's Cluster._ipython_display_
    152         if not self.asynchronous:
--> 153             self.sync(self._start)
    154 
    155     @property

~/workspace/distributed/distributed/deploy/cluster.py in sync(self, func, asynchronous, callback_timeout, *args, **kwargs)
    181             return future
    182         else:
--> 183             return sync(self.loop, func, *args, **kwargs)
    184 
    185     def _log(self, log):

~/workspace/distributed/distributed/utils.py in sync(loop, func, callback_timeout, *args, **kwargs)
    338     if error[0]:
    339         typ, exc, tb = error[0]
--> 340         raise exc.with_traceback(tb)
    341     else:
    342         return result[0]

~/workspace/distributed/distributed/utils.py in f()
    322             if callback_timeout is not None:
    323                 future = asyncio.wait_for(future, callback_timeout)
--> 324             result[0] = yield future
    325         except Exception as exc:
    326             error[0] = sys.exc_info()

~/miniconda/lib/python3.8/site-packages/tornado/gen.py in run(self)
    760 
    761                     try:
--> 762                         value = future.result()
    763                     except Exception:
    764                         exc_info = sys.exc_info()

~/miniconda/lib/python3.8/site-packages/coiled/cluster.py in _start(self)
    210             )
    211             if self._start_n_workers:
--> 212                 await self._scale(self._start_n_workers)
    213 
    214         self.security, info = await self.cloud.security(

~/miniconda/lib/python3.8/site-packages/coiled/cluster.py in _scale(self, n)
    329 
    330     async def _scale(self, n: int) -> Tuple[List[Dict], List[Dict]]:
--> 331         return await self.cloud.scale(
    332             account=self.account,
    333             cluster_id=self.cluster_id,  # type: ignore

~/miniconda/lib/python3.8/site-packages/coiled/core.py in _scale(self, cluster_id, n, account)
    499         if response.status >= 400:
    500             text = await response.text()
--> 501             raise Exception(text)
    502 
    503     def scale(self, cluster_id: int, n: int, account: str = None) -> None:

Exception: <html>
<head><title>502 Bad Gateway</title></head>
<body>
<center><h1>502 Bad Gateway</h1></center>
</body>
</html>
mrocklin commented 3 years ago

This is still going on, apparently:

import coiled
cluster = coiled.Cluster(n_workers=100, software="mrocklin/dev")
Creating Cluster. This takes about a minute ...Checking environment images
Valid environment image found
---------------------------------------------------------------------------
Exception                                 Traceback (most recent call last)
<ipython-input-14-dea5e55efa35> in <module>
----> 1 cluster = coiled.Cluster(n_workers=100, software="dev")
      2 
      3 from dask.distributed import Client, performance_report
      4 client = Client(cluster)

~/miniconda/lib/python3.8/site-packages/coiled/cluster.py in __init__(self, n_workers, configuration, software, worker_cpu, worker_gpu, worker_memory, worker_class, worker_options, scheduler_cpu, scheduler_memory, scheduler_class, scheduler_options, name, asynchronous, cloud, account, shutdown_on_close, backend_options, credentials)
    151         self._name = "coiled.Cluster"  # Used in Dask's Cluster._ipython_display_
    152         if not self.asynchronous:
--> 153             self.sync(self._start)
    154 
    155     @property

~/workspace/distributed/distributed/deploy/cluster.py in sync(self, func, asynchronous, callback_timeout, *args, **kwargs)
    181             return future
    182         else:
--> 183             return sync(self.loop, func, *args, **kwargs)
    184 
    185     def _log(self, log):

~/workspace/distributed/distributed/utils.py in sync(loop, func, callback_timeout, *args, **kwargs)
    338     if error[0]:
    339         typ, exc, tb = error[0]
--> 340         raise exc.with_traceback(tb)
    341     else:
    342         return result[0]

~/workspace/distributed/distributed/utils.py in f()
    322             if callback_timeout is not None:
    323                 future = asyncio.wait_for(future, callback_timeout)
--> 324             result[0] = yield future
    325         except Exception as exc:
    326             error[0] = sys.exc_info()

~/miniconda/lib/python3.8/site-packages/tornado/gen.py in run(self)
    760 
    761                     try:
--> 762                         value = future.result()
    763                     except Exception:
    764                         exc_info = sys.exc_info()

~/miniconda/lib/python3.8/site-packages/coiled/cluster.py in _start(self)
    210             )
    211             if self._start_n_workers:
--> 212                 await self._scale(self._start_n_workers)
    213 
    214         self.security, info = await self.cloud.security(

~/miniconda/lib/python3.8/site-packages/coiled/cluster.py in _scale(self, n)
    329 
    330     async def _scale(self, n: int) -> Tuple[List[Dict], List[Dict]]:
--> 331         return await self.cloud.scale(
    332             account=self.account,
    333             cluster_id=self.cluster_id,  # type: ignore

~/miniconda/lib/python3.8/site-packages/coiled/core.py in _scale(self, cluster_id, n, account)
    506         if response.status >= 400:
    507             text = await response.text()
--> 508             raise Exception(text)
    509 
    510     def scale(self, cluster_id: int, n: int, account: str = None) -> None:

Exception: 
<!doctype html>
<html lang="en">
<head>
  <title>Server Error (500)</title>
</head>
<body>
  <h1>Server Error (500)</h1><p></p>
</body>
</html>
mrocklin commented 3 years ago

Anecdotally, this appears to be more of an issue with scaling than with starting:

In [1]: import coiled

In [2]: cluster = coiled.Cluster(n_workers=5, software="dev")
Creating Cluster. This takes about a minute ...Checking environment images
Valid environment image found

In [3]: cluster.scale(100)
---------------------------------------------------------------------------
Exception                                 Traceback (most recent call last)
<ipython-input-3-97dd88993125> in <module>
----> 1 cluster.scale(100)

~/miniconda/lib/python3.8/site-packages/coiled/cluster.py in scale(self, n)
    343             Number of workers to scale cluster size to.
    344         """
--> 345         return self.sync(self._scale, n=n)
    346 
    347     def __enter__(self):

~/workspace/distributed/distributed/deploy/cluster.py in sync(self, func, asynchronous, callback_timeout, *args, **kwargs)
    181             return future
    182         else:
--> 183             return sync(self.loop, func, *args, **kwargs)
    184 
    185     def _log(self, log):

~/workspace/distributed/distributed/utils.py in sync(loop, func, callback_timeout, *args, **kwargs)
    338     if error[0]:
    339         typ, exc, tb = error[0]
--> 340         raise exc.with_traceback(tb)
    341     else:
    342         return result[0]

~/workspace/distributed/distributed/utils.py in f()
    322             if callback_timeout is not None:
    323                 future = asyncio.wait_for(future, callback_timeout)
--> 324             result[0] = yield future
    325         except Exception as exc:
    326             error[0] = sys.exc_info()

~/miniconda/lib/python3.8/site-packages/tornado/gen.py in run(self)
    760 
    761                     try:
--> 762                         value = future.result()
    763                     except Exception:
    764                         exc_info = sys.exc_info()

~/miniconda/lib/python3.8/site-packages/coiled/cluster.py in _scale(self, n)
    329 
    330     async def _scale(self, n: int) -> Tuple[List[Dict], List[Dict]]:
--> 331         return await self.cloud.scale(
    332             account=self.account,
    333             cluster_id=self.cluster_id,  # type: ignore

~/miniconda/lib/python3.8/site-packages/coiled/core.py in _scale(self, cluster_id, n, account)
    506         if response.status >= 400:
    507             text = await response.text()
--> 508             raise Exception(text)
    509 
    510     def scale(self, cluster_id: int, n: int, account: str = None) -> None:

Exception: <html>
<head><title>502 Bad Gateway</title></head>
<body>
<center><h1>502 Bad Gateway</h1></center>
</body>
</html>
shughes-uk commented 1 year ago

We made the client pretty robust to 502s!