Closed: TomAugspurger closed this issue 4 years ago.
Seemingly some issues with the prod deployment as well.
from dask_gateway import Gateway
gateway = Gateway()
cluster = gateway.new_cluster()
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-3-a52233bc4e18> in <module>
1 from dask_gateway import Gateway
2 gateway = Gateway()
----> 3 cluster = gateway.new_cluster()
/srv/conda/envs/notebook/lib/python3.7/site-packages/dask_gateway/client.py in new_cluster(self, cluster_options, shutdown_on_close, **kwargs)
641 cluster_options=cluster_options,
642 shutdown_on_close=shutdown_on_close,
--> 643 **kwargs,
644 )
645
/srv/conda/envs/notebook/lib/python3.7/site-packages/dask_gateway/client.py in __init__(self, address, proxy_address, public_address, auth, cluster_options, shutdown_on_close, asynchronous, loop, **kwargs)
816 shutdown_on_close=shutdown_on_close,
817 asynchronous=asynchronous,
--> 818 loop=loop,
819 )
820
/srv/conda/envs/notebook/lib/python3.7/site-packages/dask_gateway/client.py in _init_internal(self, address, proxy_address, public_address, auth, cluster_options, cluster_kwargs, shutdown_on_close, asynchronous, loop, name)
912 self.status = "starting"
913 if not self.asynchronous:
--> 914 self.gateway.sync(self._start_internal)
915
916 @property
/srv/conda/envs/notebook/lib/python3.7/site-packages/dask_gateway/client.py in sync(self, func, *args, **kwargs)
337 )
338 try:
--> 339 return future.result()
340 except BaseException:
341 future.cancel()
/srv/conda/envs/notebook/lib/python3.7/concurrent/futures/_base.py in result(self, timeout)
433 raise CancelledError()
434 elif self._state == FINISHED:
--> 435 return self.__get_result()
436 else:
437 raise TimeoutError()
/srv/conda/envs/notebook/lib/python3.7/concurrent/futures/_base.py in __get_result(self)
382 def __get_result(self):
383 if self._exception:
--> 384 raise self._exception
385 else:
386 return self._result
/srv/conda/envs/notebook/lib/python3.7/site-packages/dask_gateway/client.py in _start_internal(self)
926 self._start_task = asyncio.ensure_future(self._start_async())
927 try:
--> 928 await self._start_task
929 except BaseException:
930 # On exception, cleanup
/srv/conda/envs/notebook/lib/python3.7/site-packages/dask_gateway/client.py in _start_async(self)
940 self.status = "starting"
941 self.name = await self.gateway._submit(
--> 942 cluster_options=self._cluster_options, **self._cluster_kwargs
943 )
944 # Connect to cluster
/srv/conda/envs/notebook/lib/python3.7/site-packages/dask_gateway/client.py in _submit(self, cluster_options, **kwargs)
529 options = self._config_cluster_options()
530 options.update(kwargs)
--> 531 resp = await self._request("POST", url, json={"cluster_options": options})
532 data = await resp.json()
533 return data["name"]
/srv/conda/envs/notebook/lib/python3.7/site-packages/dask_gateway/client.py in _request(self, method, url, json)
407
408 if resp.status in {404, 422}:
--> 409 raise ValueError(msg)
410 elif resp.status == 409:
411 raise GatewayClusterError(msg)
ValueError: 404 page not found
I'll see if I still have access to debug this.
On the Dask Gateway controller:
[E 2020-10-14 13:56:25.183 KubeController] Error in cluster informer, retrying...
Traceback (most recent call last):
File "/opt/conda/lib/python3.7/site-packages/dask_gateway_server/backends/kubernetes/utils.py", line 161, in run
initial = await method(**self.method_kwargs)
File "/opt/conda/lib/python3.7/site-packages/dask_gateway_server/backends/kubernetes/utils.py", line 59, in func
return await method(*args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/kubernetes_asyncio/client/api_client.py", line 166, in __call_api
_request_timeout=_request_timeout)
File "/opt/conda/lib/python3.7/site-packages/kubernetes_asyncio/client/rest.py", line 191, in GET
query_params=query_params))
File "/opt/conda/lib/python3.7/site-packages/kubernetes_asyncio/client/rest.py", line 181, in request
raise ApiException(http_resp=r)
kubernetes_asyncio.client.rest.ApiException: (404)
Reason: Not Found
HTTP response headers: <CIMultiDictProxy('Audit-Id': 'c9a49984-04ff-45b8-8d8e-dfb9e9746ce4', 'Content-Type': 'text/plain; charset=utf-8', 'X-Content-Type-Options': 'nosniff', 'Date': 'Wed, 14 Oct 2020 13:56:25 GMT', 'Content-Length': '19')>
HTTP response body: 404 page not found
I restarted the pod, but same thing.
I wonder if this is a Kubernetes version thing? @jcrist do you know if there's a minimum version of Kubernetes that dask-gateway supports? This server is on Server Version: `v1.15.11-eks-065dce`.
Edit: the dask-gateway API pod has similar messages:
[E 2020-10-14 14:00:30.063 DaskGateway] Error in cluster informer, retrying...
Traceback (most recent call last):
File "/opt/conda/lib/python3.7/site-packages/dask_gateway_server/backends/kubernetes/utils.py", line 161, in run
initial = await method(**self.method_kwargs)
File "/opt/conda/lib/python3.7/site-packages/kubernetes_asyncio/client/api_client.py", line 166, in __call_api
_request_timeout=_request_timeout)
File "/opt/conda/lib/python3.7/site-packages/kubernetes_asyncio/client/rest.py", line 191, in GET
query_params=query_params))
File "/opt/conda/lib/python3.7/site-packages/kubernetes_asyncio/client/rest.py", line 181, in request
raise ApiException(http_resp=r)
kubernetes_asyncio.client.rest.ApiException: (404)
Reason: Not Found
HTTP response headers: <CIMultiDictProxy('Audit-Id': 'dfecd26a-26f0-4d80-ae01-16536bcead4e', 'Content-Type': 'text/plain; charset=utf-8', 'X-Content-Type-Options': 'nosniff', 'Date': 'Wed, 14 Oct 2020 14:00:30 GMT', 'Content-Length': '19')>
HTTP response body: 404 page not found
And the traefik pod has:
E1014 14:01:57.779261 1 reflector.go:125] pkg/mod/k8s.io/client-go@v0.0.0-20190718183610-8e956561bbf5/tools/cache/reflector.go:98: Failed to list *v1alpha1.IngressRouteTCP: the server could not find the requested resource (get ingressroutetcps.traefik.containo.us)
E1014 14:01:58.664475 1 reflector.go:125] pkg/mod/k8s.io/client-go@v0.0.0-20190718183610-8e956561bbf5/tools/cache/reflector.go:98: Failed to list *v1alpha1.TraefikService: the server could not find the requested resource (get traefikservices.traefik.containo.us)
E1014 14:01:58.740087 1 reflector.go:125] pkg/mod/k8s.io/client-go@v0.0.0-20190718183610-8e956561bbf5/tools/cache/reflector.go:98: Failed to list *v1alpha1.Middleware: the server could not find the requested resource (get middlewares.traefik.containo.us)
E1014 14:01:58.753002 1 reflector.go:125] pkg/mod/k8s.io/client-go@v0.0.0-20190718183610-8e956561bbf5/tools/cache/reflector.go:98: Failed to list *v1alpha1.IngressRoute: the server could not find the requested resource (get ingressroutes.traefik.containo.us)
E1014 14:01:58.778518 1 reflector.go:125] pkg/mod/k8s.io/client-go@v0.0.0-20190718183610-8e956561bbf5/tools/cache/reflector.go:98: Failed to list *v1alpha1.TLSOption: the server could not find the requested resource (get tlsoptions.traefik.containo.us)
E1014 14:01:58.781006 1 reflector.go:125] pkg/mod/k8s.io/client-go@v0.0.0-20190718183610-8e956561bbf5/tools/cache/reflector.go:98: Failed to list *v1alpha1.IngressRouteTCP: the server could not find the requested resource (get ingressroutetcps.traefik.containo.us)
Ah, maybe the CRDs didn't install properly? `kubectl get daskcluster -o yaml` fails.
OK, I think we're good now. Installing the CRDs and redeploying seemed to do the trick.
Glad you figured this out.
https://app.circleci.com/pipelines/github/pangeo-data/pangeo-binder/164/workflows/4133b01c-b22d-4ce3-9c0c-3267f7445f03/jobs/171
Another failure @salvis2, probably from prometheus-operator? We'll want to drop it here as well :/