The best I can do is to create one worker at a time by progressively calling request_resources(num_cpus=XX).
Let me know if you need more info.
File: monitor.err
`Exception in thread Exception in thread Thread-5 (spawn_updater):
Thread-4 (spawn_updater):
Traceback (most recent call last):
Traceback (most recent call last):
File "/opt/conda/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
File "/opt/conda/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
self.run()self.run()
File "/opt/conda/lib/python3.10/threading.py", line 953, in run
File "/opt/conda/lib/python3.10/threading.py", line 953, in run
self._target(*self._args, **self._kwargs)self._target(*self._args, **self._kwargs)
File "/opt/conda/lib/python3.10/site-packages/ray/autoscaler/_private/autoscaler.py", line 1338, in spawn_updater
File "/opt/conda/lib/python3.10/site-packages/ray/autoscaler/_private/autoscaler.py", line 1338, in spawn_updater
updater = NodeUpdaterThread(updater = NodeUpdaterThread(
File "/opt/conda/lib/python3.10/site-packages/ray/autoscaler/_private/updater.py", line 565, in __init__
File "/opt/conda/lib/python3.10/site-packages/ray/autoscaler/_private/updater.py", line 565, in __init__
NodeUpdater.__init__(self, *args, **kwargs)NodeUpdater.__init__(self, *args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/ray/autoscaler/_private/updater.py", line 96, in __init__
File "/opt/conda/lib/python3.10/site-packages/ray/autoscaler/_private/updater.py", line 96, in __init__
self.cmd_runner = provider.get_command_runner(
self.cmd_runner = provider.get_command_runner(
File "/opt/conda/lib/python3.10/site-packages/ray/autoscaler/_private/gcp/node_provider.py", line 334, in get_command_runner
File "/opt/conda/lib/python3.10/site-packages/ray/autoscaler/_private/gcp/node_provider.py", line 334, in get_command_runner
instance = resource.get_instance(node_id)
File "/opt/conda/lib/python3.10/site-packages/ray/autoscaler/_private/gcp/node.py", line 443, in get_instance
instance = resource.get_instance(node_id)
File "/opt/conda/lib/python3.10/site-packages/ray/autoscaler/_private/gcp/node.py", line 443, in get_instance
.execute()
File "/opt/conda/lib/python3.10/site-packages/googleapiclient/_helpers.py", line 130, in positional_wrapper
.execute()
File "/opt/conda/lib/python3.10/site-packages/googleapiclient/_helpers.py", line 130, in positional_wrapper
return wrapped(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/googleapiclient/http.py", line 844, in execute
return wrapped(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/googleapiclient/http.py", line 844, in execute
resp, content = _retry_request(
File "/opt/conda/lib/python3.10/site-packages/googleapiclient/http.py", line 183, in _retry_request
resp, content = _retry_request(
raise exception File "/opt/conda/lib/python3.10/site-packages/googleapiclient/http.py", line 183, in _retry_request
File "/opt/conda/lib/python3.10/site-packages/googleapiclient/http.py", line 164, in _retry_request
raise exception
File "/opt/conda/lib/python3.10/site-packages/googleapiclient/http.py", line 164, in _retry_request
resp, content = http.request(uri, method, *args, **kwargs)resp, content = http.request(uri, method, *args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/google_auth_httplib2.py", line 218, in request
File "/opt/conda/lib/python3.10/site-packages/google_auth_httplib2.py", line 218, in request
response, content = self.http.request(
File "/opt/conda/lib/python3.10/site-packages/httplib2/__init__.py", line 1724, in request
response, content = self.http.request(
File "/opt/conda/lib/python3.10/site-packages/httplib2/__init__.py", line 1724, in request
(response, content) = self._request((response, content) = self._request(
File "/opt/conda/lib/python3.10/site-packages/httplib2/__init__.py", line 1444, in _request
File "/opt/conda/lib/python3.10/site-packages/httplib2/__init__.py", line 1444, in _request
(response, content) = self._conn_request(conn, request_uri, method, body, headers)(response, content) = self._conn_request(conn, request_uri, method, body, headers)
File "/opt/conda/lib/python3.10/site-packages/httplib2/__init__.py", line 1396, in _conn_request
File "/opt/conda/lib/python3.10/site-packages/httplib2/__init__.py", line 1396, in _conn_request
response = conn.getresponse()
File "/opt/conda/lib/python3.10/http/client.py", line 1375, in getresponse
response = conn.getresponse()
File "/opt/conda/lib/python3.10/http/client.py", line 1375, in getresponse
response.begin()response.begin()
File "/opt/conda/lib/python3.10/http/client.py", line 318, in begin
File "/opt/conda/lib/python3.10/http/client.py", line 318, in begin
version, status, reason = self._read_status()version, status, reason = self._read_status()
File "/opt/conda/lib/python3.10/http/client.py", line 279, in _read_status
File "/opt/conda/lib/python3.10/http/client.py", line 279, in _read_status
line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1") File "/opt/conda/lib/python3.10/socket.py", line 705, in readinto
File "/opt/conda/lib/python3.10/socket.py", line 705, in readinto
return self._sock.recv_into(b)return self._sock.recv_into(b)
File "/opt/conda/lib/python3.10/ssl.py", line 1307, in recv_into
File "/opt/conda/lib/python3.10/ssl.py", line 1307, in recv_into
return self.read(nbytes, buffer)return self.read(nbytes, buffer)
File "/opt/conda/lib/python3.10/ssl.py", line 1163, in read
File "/opt/conda/lib/python3.10/ssl.py", line 1163, in read
return self._sslobj.read(len, buffer)
ssl .return self._sslobj.read(len, buffer)SSLError
: ssl[SSL] record layer failure (_ssl.c:2578).
SSLError: [SSL] record layer failure (_ssl.c:2578)`
What happened + What you expected to happen
The GCP autoscaler fails to create worker nodes. I am using the example-minimal.yaml file to replicate.
The final exception varies, but it is always related to SSL and httplib2. I have seen the following errors:
TimeoutError: The read operation timed out
[SSL] record layer failure
[SSL: WRONG_VERSION_NUMBER] wrong version number
This issue seems related to #4132, #4072, and the thread-safety issues of the underlying httplib2 library: https://github.com/googleapis/google-api-python-client/blob/main/docs/thread_safety.md
The best I can do is to create one worker at a time by progressively calling `request_resources(num_cpus=XX)`.
Let me know if you need more info.
File: monitor.err `Exception in thread Exception in thread Thread-5 (spawn_updater): Thread-4 (spawn_updater): Traceback (most recent call last): Traceback (most recent call last): File "/opt/conda/lib/python3.10/threading.py", line 1016, in _bootstrap_inner File "/opt/conda/lib/python3.10/threading.py", line 1016, in _bootstrap_inner self.run()self.run()
File "/opt/conda/lib/python3.10/threading.py", line 953, in run File "/opt/conda/lib/python3.10/threading.py", line 953, in run self._target(*self._args, *self._kwargs)self._target(self._args, **self._kwargs)
File "/opt/conda/lib/python3.10/site-packages/ray/autoscaler/_private/autoscaler.py", line 1338, in spawn_updater File "/opt/conda/lib/python3.10/site-packages/ray/autoscaler/_private/autoscaler.py", line 1338, in spawn_updater updater = NodeUpdaterThread(updater = NodeUpdaterThread(
File "/opt/conda/lib/python3.10/site-packages/ray/autoscaler/_private/updater.py", line 565, in init File "/opt/conda/lib/python3.10/site-packages/ray/autoscaler/_private/updater.py", line 565, in init NodeUpdater.init(self, *args, *kwargs)NodeUpdater.init(self, args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/ray/autoscaler/_private/updater.py", line 96, in init File "/opt/conda/lib/python3.10/site-packages/ray/autoscaler/_private/updater.py", line 96, in init self.cmd_runner = provider.get_command_runner(
self.cmd_runner = provider.get_command_runner( File "/opt/conda/lib/python3.10/site-packages/ray/autoscaler/_private/gcp/node_provider.py", line 334, in get_command_runner File "/opt/conda/lib/python3.10/site-packages/ray/autoscaler/_private/gcp/node_provider.py", line 334, in get_command_runner instance = resource.get_instance(node_id) File "/opt/conda/lib/python3.10/site-packages/ray/autoscaler/_private/gcp/node.py", line 443, in get_instance instance = resource.get_instance(node_id) File "/opt/conda/lib/python3.10/site-packages/ray/autoscaler/_private/gcp/node.py", line 443, in get_instance .execute() File "/opt/conda/lib/python3.10/site-packages/googleapiclient/_helpers.py", line 130, in positional_wrapper .execute() File "/opt/conda/lib/python3.10/site-packages/googleapiclient/_helpers.py", line 130, in positional_wrapper return wrapped(*args, *kwargs) File "/opt/conda/lib/python3.10/site-packages/googleapiclient/http.py", line 844, in execute return wrapped(args, **kwargs) File "/opt/conda/lib/python3.10/site-packages/googleapiclient/http.py", line 844, in execute resp, content = _retry_request( File "/opt/conda/lib/python3.10/site-packages/googleapiclient/http.py", line 183, in _retry_request resp, content = _retry_request(
raise exception File "/opt/conda/lib/python3.10/site-packages/googleapiclient/http.py", line 183, in _retry_request
File "/opt/conda/lib/python3.10/site-packages/googleapiclient/http.py", line 164, in _retry_request raise exception File "/opt/conda/lib/python3.10/site-packages/googleapiclient/http.py", line 164, in _retry_request resp, content = http.request(uri, method, *args, *kwargs)resp, content = http.request(uri, method, args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/google_auth_httplib2.py", line 218, in request File "/opt/conda/lib/python3.10/site-packages/google_auth_httplib2.py", line 218, in request response, content = self.http.request( File "/opt/conda/lib/python3.10/site-packages/httplib2/init.py", line 1724, in request response, content = self.http.request( File "/opt/conda/lib/python3.10/site-packages/httplib2/init.py", line 1724, in request (response, content) = self._request((response, content) = self._request(
File "/opt/conda/lib/python3.10/site-packages/httplib2/init.py", line 1444, in _request File "/opt/conda/lib/python3.10/site-packages/httplib2/init.py", line 1444, in _request (response, content) = self._conn_request(conn, request_uri, method, body, headers)(response, content) = self._conn_request(conn, request_uri, method, body, headers)
File "/opt/conda/lib/python3.10/site-packages/httplib2/init.py", line 1396, in _conn_request File "/opt/conda/lib/python3.10/site-packages/httplib2/init.py", line 1396, in _conn_request response = conn.getresponse() File "/opt/conda/lib/python3.10/http/client.py", line 1375, in getresponse response = conn.getresponse() File "/opt/conda/lib/python3.10/http/client.py", line 1375, in getresponse response.begin()response.begin()
File "/opt/conda/lib/python3.10/http/client.py", line 318, in begin File "/opt/conda/lib/python3.10/http/client.py", line 318, in begin version, status, reason = self._read_status()version, status, reason = self._read_status()
File "/opt/conda/lib/python3.10/http/client.py", line 279, in _read_status File "/opt/conda/lib/python3.10/http/client.py", line 279, in _read_status line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1") File "/opt/conda/lib/python3.10/socket.py", line 705, in readinto
File "/opt/conda/lib/python3.10/socket.py", line 705, in readinto return self._sock.recv_into(b)return self._sock.recv_into(b)
File "/opt/conda/lib/python3.10/ssl.py", line 1307, in recv_into File "/opt/conda/lib/python3.10/ssl.py", line 1307, in recv_into return self.read(nbytes, buffer)return self.read(nbytes, buffer)
File "/opt/conda/lib/python3.10/ssl.py", line 1163, in read File "/opt/conda/lib/python3.10/ssl.py", line 1163, in read return self._sslobj.read(len, buffer) ssl .return self._sslobj.read(len, buffer)SSLError : ssl[SSL] record layer failure (_ssl.c:2578). SSLError: [SSL] record layer failure (_ssl.c:2578)`
Versions / Dependencies
ray[default]==2.9.3 or nightly
Reproduction script
python/ray/autoscaler/gcp/example-minimal.yaml
Issue Severity
High: It blocks me from completing my task.