Ray is a unified framework for scaling AI and Python applications. Ray consists of a core distributed runtime and a set of AI Libraries for accelerating ML workloads.
We are using Ray with KubeRay, and have started to get connection timeouts to kubernetes.default,
which seem to break the autoscaler. It happens almost daily, or every second day.
We get these error messages repeatedly in our logs:
Traceback (most recent call last):
File "/deploy/venv/lib/python3.10/site-packages/ray/autoscaler/_private/monitor.py", line 547, in run
self._initialize_autoscaler()
File "/deploy/venv/lib/python3.10/site-packages/ray/autoscaler/_private/monitor.py", line 233, in _initialize_autoscaler
self.autoscaler = StandardAutoscaler(
File "/deploy/venv/lib/python3.10/site-packages/ray/autoscaler/_private/autoscaler.py", line 247, in __init__
self.reset(errors_fatal=True)
File "/deploy/venv/lib/python3.10/site-packages/ray/autoscaler/_private/autoscaler.py", line 1107, in reset
raise e
File "/deploy/venv/lib/python3.10/site-packages/ray/autoscaler/_private/autoscaler.py", line 1024, in reset
new_config = self.config_reader()
File "/deploy/venv/lib/python3.10/site-packages/ray/autoscaler/_private/kuberay/autoscaling_config.py", line 59, in __call__
ray_cr = self._fetch_ray_cr_from_k8s_with_retries()
File "/deploy/venv/lib/python3.10/site-packages/ray/autoscaler/_private/kuberay/autoscaling_config.py", line 71, in _fetch_ray_cr_from_k8s_with_retries
return self._fetch_ray_cr_from_k8s()
File "/deploy/venv/lib/python3.10/site-packages/ray/autoscaler/_private/kuberay/autoscaling_config.py", line 85, in _fetch_ray_cr_from_k8s
result = requests.get(
File "/deploy/venv/lib/python3.10/site-packages/requests/api.py", line 73, in get
return request("get", url, params=params, **kwargs)
File "/deploy/venv/lib/python3.10/site-packages/requests/api.py", line 59, in request
return session.request(method=method, url=url, **kwargs)
File "/deploy/venv/lib/python3.10/site-packages/requests/sessions.py", line 587, in request
resp = self.send(prep, **send_kwargs)
File "/deploy/venv/lib/python3.10/site-packages/requests/sessions.py", line 701, in send
r = adapter.send(request, **kwargs)
File "/deploy/venv/lib/python3.10/site-packages/requests/adapters.py", line 553, in send
raise ConnectTimeout(e, request=request)
requests.exceptions.ConnectTimeout: HTTPSConnectionPool(host='kubernetes.default', port=443): Max retries exceeded with url: /apis/ray.io/v1alpha1/namespaces/default/rayclusters/ray-djmg5 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f3b2a3fc370>, 'Connection to kubernetes.default timed out. (connect timeout=None)'))
Traceback (most recent call last):
File "/deploy/venv/bin/ray", line 8, in <module>
sys.exit(main())
File "/deploy/venv/lib/python3.10/site-packages/ray/scripts/scripts.py", line 2422, in main
return cli()
File "/deploy/venv/lib/python3.10/site-packages/click/core.py", line 1130, in __call__
return self.main(*args, **kwargs)
File "/deploy/venv/lib/python3.10/site-packages/click/core.py", line 1055, in main
rv = self.invoke(ctx)
File "/deploy/venv/lib/python3.10/site-packages/click/core.py", line 1657, in invoke
return _process_result(sub_ctx.command.invoke(sub_ctx))
File "/deploy/venv/lib/python3.10/site-packages/click/core.py", line 1404, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/deploy/venv/lib/python3.10/site-packages/click/core.py", line 760, in invoke
return __callback(*args, **kwargs)
File "/deploy/venv/lib/python3.10/site-packages/ray/scripts/scripts.py", line 2168, in kuberay_autoscaler
run_kuberay_autoscaler(cluster_name, cluster_namespace)
File "/deploy/venv/lib/python3.10/site-packages/ray/autoscaler/_private/kuberay/run_autoscaler.py", line 64, in run_kuberay_autoscaler
).run()
File "/deploy/venv/lib/python3.10/site-packages/ray/autoscaler/_private/monitor.py", line 547, in run
self._initialize_autoscaler()
File "/deploy/venv/lib/python3.10/site-packages/ray/autoscaler/_private/monitor.py", line 233, in _initialize_autoscaler
self.autoscaler = StandardAutoscaler(
File "/deploy/venv/lib/python3.10/site-packages/ray/autoscaler/_private/autoscaler.py", line 247, in __init__
self.reset(errors_fatal=True)
File "/deploy/venv/lib/python3.10/site-packages/ray/autoscaler/_private/autoscaler.py", line 1107, in reset
raise e
File "/deploy/venv/lib/python3.10/site-packages/ray/autoscaler/_private/autoscaler.py", line 1024, in reset
new_config = self.config_reader()
File "/deploy/venv/lib/python3.10/site-packages/ray/autoscaler/_private/kuberay/autoscaling_config.py", line 59, in __call__
ray_cr = self._fetch_ray_cr_from_k8s_with_retries()
File "/deploy/venv/lib/python3.10/site-packages/ray/autoscaler/_private/kuberay/autoscaling_config.py", line 71, in _fetch_ray_cr_from_k8s_with_retries
return self._fetch_ray_cr_from_k8s()
File "/deploy/venv/lib/python3.10/site-packages/ray/autoscaler/_private/kuberay/autoscaling_config.py", line 85, in _fetch_ray_cr_from_k8s
result = requests.get(
File "/deploy/venv/lib/python3.10/site-packages/requests/api.py", line 73, in get
return request("get", url, params=params, **kwargs)
File "/deploy/venv/lib/python3.10/site-packages/requests/api.py", line 59, in request
return session.request(method=method, url=url, **kwargs)
File "/deploy/venv/lib/python3.10/site-packages/requests/sessions.py", line 587, in request
resp = self.send(prep, **send_kwargs)
File "/deploy/venv/lib/python3.10/site-packages/requests/sessions.py", line 701, in send
r = adapter.send(request, **kwargs)
File "/deploy/venv/lib/python3.10/site-packages/requests/adapters.py", line 553, in send
raise ConnectTimeout(e, request=request)
requests.exceptions.ConnectTimeout: HTTPSConnectionPool(host='kubernetes.default', port=443): Max retries exceeded with url: /apis/ray.io/v1alpha1/namespaces/default/rayclusters/ray-djmg5 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f3b2a3fc370>, 'Connection to kubernetes.default timed out. (connect timeout=None)'))
Traceback (most recent call last):
File "/deploy/venv/lib/python3.10/site-packages/requests/adapters.py", line 489, in send
resp = conn.urlopen(
File "/deploy/venv/lib/python3.10/site-packages/urllib3/connectionpool.py", line 787, in urlopen
retries = retries.increment(
File "/deploy/venv/lib/python3.10/site-packages/urllib3/util/retry.py", line 592, in increment
raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='kubernetes.default', port=443): Max retries exceeded with url: /apis/ray.io/v1alpha1/namespaces/default/rayclusters/ray-djmg5 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f3b2a3fc370>, 'Connection to kubernetes.default timed out. (connect timeout=None)'))
Versions / Dependencies
ray version: 2.3.0
os: Linux (ubuntu22.04)
python version: 3.10
Reproduction script
No reproduction script.
Issue Severity
Medium: It is a significant difficulty but I can work around it.
What happened + What you expected to happen
We are using Ray with KubeRay, and have started to get connection timeouts to kubernetes.default, which seem to break the autoscaler. It happens almost daily, or every second day.
We get these error messages repeatedly in our logs:
Versions / Dependencies
ray version: 2.3.0; os: Linux (ubuntu22.04); python version: 3.10
Reproduction script
No reproduction script.
Issue Severity
Medium: It is a significant difficulty but I can work around it.