Closed mrocklin closed 1 year ago
This is still going on, apparently:
import coiled
cluster = coiled.Cluster(n_workers=100, software="mrocklin/dev")
Creating Cluster. This takes about a minute ...Checking environment images
Valid environment image found
---------------------------------------------------------------------------
Exception Traceback (most recent call last)
<ipython-input-14-dea5e55efa35> in <module>
----> 1 cluster = coiled.Cluster(n_workers=100, software="dev")
2
3 from dask.distributed import Client, performance_report
4 client = Client(cluster)
~/miniconda/lib/python3.8/site-packages/coiled/cluster.py in __init__(self, n_workers, configuration, software, worker_cpu, worker_gpu, worker_memory, worker_class, worker_options, scheduler_cpu, scheduler_memory, scheduler_class, scheduler_options, name, asynchronous, cloud, account, shutdown_on_close, backend_options, credentials)
151 self._name = "coiled.Cluster" # Used in Dask's Cluster._ipython_display_
152 if not self.asynchronous:
--> 153 self.sync(self._start)
154
155 @property
~/workspace/distributed/distributed/deploy/cluster.py in sync(self, func, asynchronous, callback_timeout, *args, **kwargs)
181 return future
182 else:
--> 183 return sync(self.loop, func, *args, **kwargs)
184
185 def _log(self, log):
~/workspace/distributed/distributed/utils.py in sync(loop, func, callback_timeout, *args, **kwargs)
338 if error[0]:
339 typ, exc, tb = error[0]
--> 340 raise exc.with_traceback(tb)
341 else:
342 return result[0]
~/workspace/distributed/distributed/utils.py in f()
322 if callback_timeout is not None:
323 future = asyncio.wait_for(future, callback_timeout)
--> 324 result[0] = yield future
325 except Exception as exc:
326 error[0] = sys.exc_info()
~/miniconda/lib/python3.8/site-packages/tornado/gen.py in run(self)
760
761 try:
--> 762 value = future.result()
763 except Exception:
764 exc_info = sys.exc_info()
~/miniconda/lib/python3.8/site-packages/coiled/cluster.py in _start(self)
210 )
211 if self._start_n_workers:
--> 212 await self._scale(self._start_n_workers)
213
214 self.security, info = await self.cloud.security(
~/miniconda/lib/python3.8/site-packages/coiled/cluster.py in _scale(self, n)
329
330 async def _scale(self, n: int) -> Tuple[List[Dict], List[Dict]]:
--> 331 return await self.cloud.scale(
332 account=self.account,
333 cluster_id=self.cluster_id, # type: ignore
~/miniconda/lib/python3.8/site-packages/coiled/core.py in _scale(self, cluster_id, n, account)
506 if response.status >= 400:
507 text = await response.text()
--> 508 raise Exception(text)
509
510 def scale(self, cluster_id: int, n: int, account: str = None) -> None:
Exception:
<!doctype html>
<html lang="en">
<head>
<title>Server Error (500)</title>
</head>
<body>
<h1>Server Error (500)</h1><p></p>
</body>
</html>
Anecdotally, this appears to be more of an issue with scaling than with starting:
In [1]: import coiled
In [2]: cluster = coiled.Cluster(n_workers=5, software="dev")
Creating Cluster. This takes about a minute ...Checking environment images
Valid environment image found
In [3]: cluster.scale(100)
---------------------------------------------------------------------------
Exception Traceback (most recent call last)
<ipython-input-3-97dd88993125> in <module>
----> 1 cluster.scale(100)
~/miniconda/lib/python3.8/site-packages/coiled/cluster.py in scale(self, n)
343 Number of workers to scale cluster size to.
344 """
--> 345 return self.sync(self._scale, n=n)
346
347 def __enter__(self):
~/workspace/distributed/distributed/deploy/cluster.py in sync(self, func, asynchronous, callback_timeout, *args, **kwargs)
181 return future
182 else:
--> 183 return sync(self.loop, func, *args, **kwargs)
184
185 def _log(self, log):
~/workspace/distributed/distributed/utils.py in sync(loop, func, callback_timeout, *args, **kwargs)
338 if error[0]:
339 typ, exc, tb = error[0]
--> 340 raise exc.with_traceback(tb)
341 else:
342 return result[0]
~/workspace/distributed/distributed/utils.py in f()
322 if callback_timeout is not None:
323 future = asyncio.wait_for(future, callback_timeout)
--> 324 result[0] = yield future
325 except Exception as exc:
326 error[0] = sys.exc_info()
~/miniconda/lib/python3.8/site-packages/tornado/gen.py in run(self)
760
761 try:
--> 762 value = future.result()
763 except Exception:
764 exc_info = sys.exc_info()
~/miniconda/lib/python3.8/site-packages/coiled/cluster.py in _scale(self, n)
329
330 async def _scale(self, n: int) -> Tuple[List[Dict], List[Dict]]:
--> 331 return await self.cloud.scale(
332 account=self.account,
333 cluster_id=self.cluster_id, # type: ignore
~/miniconda/lib/python3.8/site-packages/coiled/core.py in _scale(self, cluster_id, n, account)
506 if response.status >= 400:
507 text = await response.text()
--> 508 raise Exception(text)
509
510 def scale(self, cluster_id: int, n: int, account: str = None) -> None:
Exception: <html>
<head><title>502 Bad Gateway</title></head>
<body>
<center><h1>502 Bad Gateway</h1></center>
</body>
</html>
We made the client pretty robust to 502s!