An open-source AutoML toolkit for automating the machine learning lifecycle, including feature engineering, neural architecture search, model compression, and hyper-parameter tuning.
Describe the issue: I am running multiple NNI experiments on my university's server at the same time (7 experiments, each using one GPU, for 7 days). Every experiment failed at about the same time with the same error. Any idea what might have caused this?
[2023-10-03 10:33:22] ERROR: Strategy failed to execute.
[2023-10-03 10:35:40] ERROR: Failed to receive command. Retry in 0s
Traceback (most recent call last):
File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/websockets/legacy/protocol.py", line 959, in transfer_data
message = await self.read_message()
File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/websockets/legacy/protocol.py", line 1029, in read_message
frame = await self.read_data_frame(max_size=self.max_size)
File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/websockets/legacy/protocol.py", line 1104, in read_data_frame
frame = await self.read_frame(max_size)
File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/websockets/legacy/protocol.py", line 1161, in read_frame
frame = await Frame.read(
File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/websockets/legacy/framing.py", line 68, in read
data = await reader(2)
File "/home/lmarreiros/miniconda3/lib/python3.9/asyncio/streams.py", line 723, in readexactly
await self._wait_for_data('readexactly')
File "/home/lmarreiros/miniconda3/lib/python3.9/asyncio/streams.py", line 517, in _wait_for_data
await self._waiter
asyncio.exceptions.CancelledError
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/nni/runtime/command_channel/websocket/channel.py", line 99, in _receive_command
command = conn.receive()
File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/nni/runtime/command_channel/websocket/connection.py", line 103, in receive
msg = _wait(self._ws.recv())
File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/nni/runtime/command_channel/websocket/connection.py", line 121, in _wait
return future.result()
File "/home/lmarreiros/miniconda3/lib/python3.9/concurrent/futures/_base.py", line 446, in result
return self.get_result()
File "/home/lmarreiros/miniconda3/lib/python3.9/concurrent/futures/_base.py", line 391, in get_result
raise self._exception
File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/websockets/legacy/protocol.py", line 568, in recv
await self.ensure_open()
File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/websockets/legacy/protocol.py", line 944, in ensure_open
raise self.connection_closed_exc()
websockets.exceptions.ConnectionClosedError: sent 1011 (unexpected error) keepalive ping timeout; no close frame received
[2023-10-03 10:36:15] Stopping experiment, please wait...
[2023-10-03 10:36:17] ERROR: Failed to receive command. Retry in 1s
Traceback (most recent call last):
File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/nni/runtime/command_channel/websocket/channel.py", line 98, in _receive_command
conn = self._ensure_conn()
File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/nni/runtime/command_channel/websocket/channel.py", line 75, in _ensure_conn
self._conn.connect()
File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/nni/runtime/command_channel/websocket/connection.py", line 65, in connect
self._ws = _wait(_connect_async(self._url))
File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/nni/runtime/command_channel/websocket/connection.py", line 121, in _wait
return future.result()
File "/home/lmarreiros/miniconda3/lib/python3.9/concurrent/futures/_base.py", line 446, in result
return self.get_result()
File "/home/lmarreiros/miniconda3/lib/python3.9/concurrent/futures/_base.py", line 391, in get_result
raise self._exception
File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/nni/runtime/command_channel/websocket/connection.py", line 135, in _connect_async
return await websockets.connect(url, max_size=None) # type: ignore
File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/websockets/legacy/client.py", line 655, in await_impl_timeout
return await self.await_impl()
File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/websockets/legacy/client.py", line 659, in await_impl
_transport, _protocol = await self._create_connection()
File "/home/lmarreiros/miniconda3/lib/python3.9/asyncio/base_events.py", line 1026, in create_connection
infos = await self._ensure_resolved(
File "/home/lmarreiros/miniconda3/lib/python3.9/asyncio/base_events.py", line 1405, in _ensure_resolved
return await loop.getaddrinfo(host, port, family=family, type=type,
File "/home/lmarreiros/miniconda3/lib/python3.9/asyncio/base_events.py", line 861, in getaddrinfo
return await self.run_in_executor(
File "/home/lmarreiros/miniconda3/lib/python3.9/asyncio/base_events.py", line 819, in run_in_executor
executor.submit(func, args), loop=self)
File "/home/lmarreiros/miniconda3/lib/python3.9/concurrent/futures/thread.py", line 169, in submit
raise RuntimeError('cannot schedule new futures after '
RuntimeError: cannot schedule new futures after interpreter shutdown
[2023-10-03 10:36:49] ERROR: Failed to receive command. Retry in 2s
Traceback (most recent call last):
File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/nni/runtime/command_channel/websocket/channel.py", line 98, in _receive_command
conn = self._ensure_conn()
File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/nni/runtime/command_channel/websocket/channel.py", line 75, in _ensure_conn
self._conn.connect()
File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/nni/runtime/command_channel/websocket/connection.py", line 65, in connect
self._ws = _wait(_connect_async(self._url))
File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/nni/runtime/command_channel/websocket/connection.py", line 121, in _wait
return future.result()
File "/home/lmarreiros/miniconda3/lib/python3.9/concurrent/futures/_base.py", line 446, in result
return self.get_result()
File "/home/lmarreiros/miniconda3/lib/python3.9/concurrent/futures/_base.py", line 391, in get_result
raise self._exception
File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/nni/runtime/command_channel/websocket/connection.py", line 135, in _connect_async
return await websockets.connect(url, max_size=None) # type: ignore
File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/websockets/legacy/client.py", line 655, in await_impl_timeout
return await self.await_impl()
File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/websockets/legacy/client.py", line 659, in await_impl
_transport, _protocol = await self._create_connection()
File "/home/lmarreiros/miniconda3/lib/python3.9/asyncio/base_events.py", line 1026, in create_connection
infos = await self._ensure_resolved(
File "/home/lmarreiros/miniconda3/lib/python3.9/asyncio/base_events.py", line 1405, in _ensure_resolved
return await loop.getaddrinfo(host, port, family=family, type=type,
File "/home/lmarreiros/miniconda3/lib/python3.9/asyncio/base_events.py", line 861, in getaddrinfo
return await self.run_in_executor(
File "/home/lmarreiros/miniconda3/lib/python3.9/asyncio/base_events.py", line 819, in run_in_executor
executor.submit(func, args), loop=self)
File "/home/lmarreiros/miniconda3/lib/python3.9/concurrent/futures/thread.py", line 169, in submit
raise RuntimeError('cannot schedule new futures after '
RuntimeError: cannot schedule new futures after interpreter shutdown
[2023-10-03 10:36:56] Checkpoint saved to /home/lmarreiros/omnia-nas/omnia/examples/drug_synergy/nni/expr_dgi_drugs_ECFP4/MultiInputModel/3n9pl067/checkpoint.
[2023-10-03 10:37:00] ERROR: Failed to receive command. Retry in 3s
Traceback (most recent call last):
File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/nni/runtime/command_channel/websocket/channel.py", line 98, in _receive_command
conn = self._ensure_conn()
File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/nni/runtime/command_channel/websocket/channel.py", line 75, in _ensure_conn
self._conn.connect()
File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/nni/runtime/command_channel/websocket/connection.py", line 65, in connect
self._ws = _wait(_connect_async(self._url))
File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/nni/runtime/command_channel/websocket/connection.py", line 121, in _wait
return future.result()
File "/home/lmarreiros/miniconda3/lib/python3.9/concurrent/futures/_base.py", line 446, in result
return self.get_result()
File "/home/lmarreiros/miniconda3/lib/python3.9/concurrent/futures/_base.py", line 391, in get_result
raise self._exception
File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/nni/runtime/command_channel/websocket/connection.py", line 135, in _connect_async
return await websockets.connect(url, max_size=None) # type: ignore
File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/websockets/legacy/client.py", line 655, in await_impl_timeout
return await self.await_impl()
File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/websockets/legacy/client.py", line 659, in await_impl
_transport, _protocol = await self._create_connection()
File "/home/lmarreiros/miniconda3/lib/python3.9/asyncio/base_events.py", line 1026, in create_connection
infos = await self._ensure_resolved(
File "/home/lmarreiros/miniconda3/lib/python3.9/asyncio/base_events.py", line 1405, in _ensure_resolved
return await loop.getaddrinfo(host, port, family=family, type=type,
File "/home/lmarreiros/miniconda3/lib/python3.9/asyncio/base_events.py", line 861, in getaddrinfo
return await self.run_in_executor(
File "/home/lmarreiros/miniconda3/lib/python3.9/asyncio/base_events.py", line 819, in run_in_executor
executor.submit(func, args), loop=self)
File "/home/lmarreiros/miniconda3/lib/python3.9/concurrent/futures/thread.py", line 169, in submit
raise RuntimeError('cannot schedule new futures after '
RuntimeError: cannot schedule new futures after interpreter shutdown
[2023-10-03 10:37:13] ERROR: Failed to receive command. Retry in 4s
Traceback (most recent call last):
File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/nni/runtime/command_channel/websocket/channel.py", line 98, in _receive_command
conn = self._ensure_conn()
File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/nni/runtime/command_channel/websocket/channel.py", line 75, in _ensure_conn
self._conn.connect()
File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/nni/runtime/command_channel/websocket/connection.py", line 65, in connect
self._ws = _wait(_connect_async(self._url))
File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/nni/runtime/command_channel/websocket/connection.py", line 121, in _wait
return future.result()
File "/home/lmarreiros/miniconda3/lib/python3.9/concurrent/futures/_base.py", line 446, in result
return self.get_result()
File "/home/lmarreiros/miniconda3/lib/python3.9/concurrent/futures/_base.py", line 391, in get_result
raise self._exception
File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/nni/runtime/command_channel/websocket/connection.py", line 135, in _connect_async
return await websockets.connect(url, max_size=None) # type: ignore
File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/websockets/legacy/client.py", line 655, in await_impl_timeout
return await self.await_impl()
File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/websockets/legacy/client.py", line 659, in await_impl
_transport, _protocol = await self._create_connection()
File "/home/lmarreiros/miniconda3/lib/python3.9/asyncio/base_events.py", line 1026, in create_connection
infos = await self._ensure_resolved(
File "/home/lmarreiros/miniconda3/lib/python3.9/asyncio/base_events.py", line 1405, in _ensure_resolved
return await loop.getaddrinfo(host, port, family=family, type=type,
File "/home/lmarreiros/miniconda3/lib/python3.9/asyncio/base_events.py", line 861, in getaddrinfo
return await self.run_in_executor(
File "/home/lmarreiros/miniconda3/lib/python3.9/asyncio/base_events.py", line 819, in run_in_executor
executor.submit(func, args), loop=self)
File "/home/lmarreiros/miniconda3/lib/python3.9/concurrent/futures/thread.py", line 169, in submit
raise RuntimeError('cannot schedule new futures after '
RuntimeError: cannot schedule new futures after interpreter shutdown
[2023-10-03 10:37:25] WARNING: Failed to receive command. Last retry
[2023-10-03 10:37:40] Experiment stopped
Environment:
NNI version: 3.0rc1
Training service (local|remote|pai|aml|etc): local
Describe the issue: I am running multiple NNI experiments on my university's server at the same time (7 experiments, each using one GPU, for 7 days). Every experiment failed at about the same time with the same error. Any idea what might have caused this?
[2023-10-03 10:33:22] [31mERROR: Strategy failed to execute.[0m [2023-10-03 10:35:40] [31mERROR: Failed to receive command. Retry in 0s[0m Traceback (most recent call last): File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/websockets/legacy/protocol.py", line 959, in transfer_data message = await self.read_message() File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/websockets/legacy/protocol.py", line 1029, in read_message frame = await self.read_data_frame(max_size=self.max_size) File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/websockets/legacy/protocol.py", line 1104, in read_data_frame frame = await self.read_frame(max_size) File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/websockets/legacy/protocol.py", line 1161, in read_frame frame = await Frame.read( File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/websockets/legacy/framing.py", line 68, in read data = await reader(2) File "/home/lmarreiros/miniconda3/lib/python3.9/asyncio/streams.py", line 723, in readexactly await self._wait_for_data('readexactly') File "/home/lmarreiros/miniconda3/lib/python3.9/asyncio/streams.py", line 517, in _wait_for_data await self._waiter asyncio.exceptions.CancelledError
The above exception was the direct cause of the following exception:
Traceback (most recent call last): File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/nni/runtime/command_channel/websocket/channel.py", line 99, in _receive_command command = conn.receive() File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/nni/runtime/command_channel/websocket/connection.py", line 103, in receive msg = _wait(self._ws.recv()) File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/nni/runtime/command_channel/websocket/connection.py", line 121, in _wait return future.result() File "/home/lmarreiros/miniconda3/lib/python3.9/concurrent/futures/_base.py", line 446, in result return self.get_result() File "/home/lmarreiros/miniconda3/lib/python3.9/concurrent/futures/_base.py", line 391, in get_result raise self._exception File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/websockets/legacy/protocol.py", line 568, in recv await self.ensure_open() File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/websockets/legacy/protocol.py", line 944, in ensure_open raise self.connection_closed_exc() websockets.exceptions.ConnectionClosedError: sent 1011 (unexpected error) keepalive ping timeout; no close frame received [2023-10-03 10:36:15] [32mStopping experiment, please wait...[0m [2023-10-03 10:36:17] [31mERROR: Failed to receive command. 
Retry in 1s[0m Traceback (most recent call last): File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/nni/runtime/command_channel/websocket/channel.py", line 98, in _receive_command conn = self._ensure_conn() File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/nni/runtime/command_channel/websocket/channel.py", line 75, in _ensure_conn self._conn.connect() File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/nni/runtime/command_channel/websocket/connection.py", line 65, in connect self._ws = _wait(_connect_async(self._url)) File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/nni/runtime/command_channel/websocket/connection.py", line 121, in _wait return future.result() File "/home/lmarreiros/miniconda3/lib/python3.9/concurrent/futures/_base.py", line 446, in result return self.get_result() File "/home/lmarreiros/miniconda3/lib/python3.9/concurrent/futures/_base.py", line 391, in get_result raise self._exception File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/nni/runtime/command_channel/websocket/connection.py", line 135, in _connect_async return await websockets.connect(url, max_size=None) # type: ignore File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/websockets/legacy/client.py", line 655, in await_impl_timeout return await self.await_impl() File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/websockets/legacy/client.py", line 659, in await_impl _transport, _protocol = await self._create_connection() File "/home/lmarreiros/miniconda3/lib/python3.9/asyncio/base_events.py", line 1026, in create_connection infos = await self._ensure_resolved( File 
"/home/lmarreiros/miniconda3/lib/python3.9/asyncio/base_events.py", line 1405, in _ensure_resolved return await loop.getaddrinfo(host, port, family=family, type=type, File "/home/lmarreiros/miniconda3/lib/python3.9/asyncio/base_events.py", line 861, in getaddrinfo return await self.run_in_executor( File "/home/lmarreiros/miniconda3/lib/python3.9/asyncio/base_events.py", line 819, in run_in_executor executor.submit(func, args), loop=self) File "/home/lmarreiros/miniconda3/lib/python3.9/concurrent/futures/thread.py", line 169, in submit raise RuntimeError('cannot schedule new futures after ' RuntimeError: cannot schedule new futures after interpreter shutdown [2023-10-03 10:36:49] [31mERROR: Failed to receive command. Retry in 2s[0m Traceback (most recent call last): File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/nni/runtime/command_channel/websocket/channel.py", line 98, in _receive_command conn = self._ensure_conn() File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/nni/runtime/command_channel/websocket/channel.py", line 75, in _ensure_conn self._conn.connect() File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/nni/runtime/command_channel/websocket/connection.py", line 65, in connect self._ws = _wait(_connect_async(self._url)) File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/nni/runtime/command_channel/websocket/connection.py", line 121, in _wait return future.result() File "/home/lmarreiros/miniconda3/lib/python3.9/concurrent/futures/_base.py", line 446, in result return self.get_result() File "/home/lmarreiros/miniconda3/lib/python3.9/concurrent/futures/_base.py", line 391, in get_result raise self._exception File 
"/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/nni/runtime/command_channel/websocket/connection.py", line 135, in _connect_async return await websockets.connect(url, max_size=None) # type: ignore File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/websockets/legacy/client.py", line 655, in await_impl_timeout return await self.await_impl() File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/websockets/legacy/client.py", line 659, in await_impl _transport, _protocol = await self._create_connection() File "/home/lmarreiros/miniconda3/lib/python3.9/asyncio/base_events.py", line 1026, in create_connection infos = await self._ensure_resolved( File "/home/lmarreiros/miniconda3/lib/python3.9/asyncio/base_events.py", line 1405, in _ensure_resolved return await loop.getaddrinfo(host, port, family=family, type=type, File "/home/lmarreiros/miniconda3/lib/python3.9/asyncio/base_events.py", line 861, in getaddrinfo return await self.run_in_executor( File "/home/lmarreiros/miniconda3/lib/python3.9/asyncio/base_events.py", line 819, in run_in_executor executor.submit(func, args), loop=self) File "/home/lmarreiros/miniconda3/lib/python3.9/concurrent/futures/thread.py", line 169, in submit raise RuntimeError('cannot schedule new futures after ' RuntimeError: cannot schedule new futures after interpreter shutdown [2023-10-03 10:36:56] [32mCheckpoint saved to /home/lmarreiros/omnia-nas/omnia/examples/drug_synergy/nni/expr_dgi_drugs_ECFP4/MultiInputModel/3n9pl067/checkpoint.[0m [2023-10-03 10:37:00] [31mERROR: Failed to receive command. 
Retry in 3s[0m Traceback (most recent call last): File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/nni/runtime/command_channel/websocket/channel.py", line 98, in _receive_command conn = self._ensure_conn() File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/nni/runtime/command_channel/websocket/channel.py", line 75, in _ensure_conn self._conn.connect() File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/nni/runtime/command_channel/websocket/connection.py", line 65, in connect self._ws = _wait(_connect_async(self._url)) File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/nni/runtime/command_channel/websocket/connection.py", line 121, in _wait return future.result() File "/home/lmarreiros/miniconda3/lib/python3.9/concurrent/futures/_base.py", line 446, in result return self.get_result() File "/home/lmarreiros/miniconda3/lib/python3.9/concurrent/futures/_base.py", line 391, in get_result raise self._exception File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/nni/runtime/command_channel/websocket/connection.py", line 135, in _connect_async return await websockets.connect(url, max_size=None) # type: ignore File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/websockets/legacy/client.py", line 655, in await_impl_timeout return await self.await_impl() File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/websockets/legacy/client.py", line 659, in await_impl _transport, _protocol = await self._create_connection() File "/home/lmarreiros/miniconda3/lib/python3.9/asyncio/base_events.py", line 1026, in create_connection infos = await self._ensure_resolved( File 
"/home/lmarreiros/miniconda3/lib/python3.9/asyncio/base_events.py", line 1405, in _ensure_resolved return await loop.getaddrinfo(host, port, family=family, type=type, File "/home/lmarreiros/miniconda3/lib/python3.9/asyncio/base_events.py", line 861, in getaddrinfo return await self.run_in_executor( File "/home/lmarreiros/miniconda3/lib/python3.9/asyncio/base_events.py", line 819, in run_in_executor executor.submit(func, args), loop=self) File "/home/lmarreiros/miniconda3/lib/python3.9/concurrent/futures/thread.py", line 169, in submit raise RuntimeError('cannot schedule new futures after ' RuntimeError: cannot schedule new futures after interpreter shutdown [2023-10-03 10:37:13] [31mERROR: Failed to receive command. Retry in 4s[0m Traceback (most recent call last): File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/nni/runtime/command_channel/websocket/channel.py", line 98, in _receive_command conn = self._ensure_conn() File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/nni/runtime/command_channel/websocket/channel.py", line 75, in _ensure_conn self._conn.connect() File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/nni/runtime/command_channel/websocket/connection.py", line 65, in connect self._ws = _wait(_connect_async(self._url)) File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/nni/runtime/command_channel/websocket/connection.py", line 121, in _wait return future.result() File "/home/lmarreiros/miniconda3/lib/python3.9/concurrent/futures/_base.py", line 446, in result return self.get_result() File "/home/lmarreiros/miniconda3/lib/python3.9/concurrent/futures/_base.py", line 391, in get_result raise self._exception File 
"/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/nni/runtime/command_channel/websocket/connection.py", line 135, in _connect_async return await websockets.connect(url, max_size=None) # type: ignore File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/websockets/legacy/client.py", line 655, in await_impl_timeout return await self.await_impl() File "/home/lmarreiros/.cache/pypoetry/virtualenvs/omnia-local-AEBrPPsi-py3.9/lib/python3.9/site-packages/websockets/legacy/client.py", line 659, in await_impl _transport, _protocol = await self._create_connection() File "/home/lmarreiros/miniconda3/lib/python3.9/asyncio/base_events.py", line 1026, in create_connection infos = await self._ensure_resolved( File "/home/lmarreiros/miniconda3/lib/python3.9/asyncio/base_events.py", line 1405, in _ensure_resolved return await loop.getaddrinfo(host, port, family=family, type=type, File "/home/lmarreiros/miniconda3/lib/python3.9/asyncio/base_events.py", line 861, in getaddrinfo return await self.run_in_executor( File "/home/lmarreiros/miniconda3/lib/python3.9/asyncio/base_events.py", line 819, in run_in_executor executor.submit(func, args), loop=self) File "/home/lmarreiros/miniconda3/lib/python3.9/concurrent/futures/thread.py", line 169, in submit raise RuntimeError('cannot schedule new futures after ' RuntimeError: cannot schedule new futures after interpreter shutdown [2023-10-03 10:37:25] [33mWARNING: Failed to receive command. Last retry[0m [2023-10-03 10:37:40] [32mExperiment stopped[0m
Environment:
Log message: