I am using SMARTS 1.2.0 to train a model. After training for ~12 hours, SMARTS always freezes. I talked to the SMARTS team, and they believe it is related to SumoTrafficSimulation._cumulative_sim_seconds. This variable is probably not reset and grows very large over time. Below is the traceback printed when I interrupt the frozen training run:
^C^CTraceback (most recent call last):
File "/home/edward/anaconda3/envs/smarts/lib/python3.8/site-packages/smarts/core/smarts.py", line 260, in step
return self._step(agent_actions, time_delta_since_last_step)
File "/home/edward/anaconda3/envs/smarts/lib/python3.8/site-packages/smarts/core/smarts.py", line 317, in _step
provider_state = self._step_providers(all_agent_actions)
File "/home/edward/anaconda3/envs/smarts/lib/python3.8/site-packages/smarts/core/smarts.py", line 1334, in _step_providers
provider_state = provider.step(
File "/home/edward/anaconda3/envs/smarts/lib/python3.8/site-packages/smarts/core/sumo_traffic_simulation.py", line 471, in step
self._last_provider_state = self._step(dt)
File "/home/edward/anaconda3/envs/smarts/lib/python3.8/site-packages/smarts/core/sumo_traffic_simulation.py", line 482, in _step
self._traci_conn.simulationStep(self._cumulative_sim_seconds)
File "/home/edward/anaconda3/envs/smarts/lib/python3.8/site-packages/smarts/core/utils/sumo.py", line 236, in _wrap_traci_method
return method(*args, **kwargs)
File "/home/edward/anaconda3/envs/smarts/lib/python3.8/site-packages/sumo/tools/traci/connection.py", line 366, in simulationStep
result = self._sendCmd(tc.CMD_SIMSTEP, None, None, "D", step)
File "/home/edward/anaconda3/envs/smarts/lib/python3.8/site-packages/sumo/tools/traci/connection.py", line 228, in _sendCmd
return self._sendExact()
File "/home/edward/anaconda3/envs/smarts/lib/python3.8/site-packages/sumo/tools/traci/connection.py", line 131, in _sendExact
result = self._recvExact()
File "/home/edward/anaconda3/envs/smarts/lib/python3.8/site-packages/sumo/tools/traci/connection.py", line 109, in _recvExact
t = self._socket.recv(4 - len(result))
KeyboardInterrupt
Version
I used v1.2.0
Steps to reproduce the bug
If the current judgement is right and the bug is caused by self._cumulative_sim_seconds, running SMARTS for many episodes over a long period will reproduce the bug.
Running an experiment with a high average number of steps and a single map with more than one traffic variation is guaranteed to cause this issue.
System info
System info:
Ubuntu 20.04
Python 3.8
Date:
2023-09-26
Error logs and screenshots
^C^CTraceback (most recent call last):
File "/home/edward/anaconda3/envs/smarts/lib/python3.8/site-packages/smarts/core/smarts.py", line 260, in step
return self._step(agent_actions, time_delta_since_last_step)
File "/home/edward/anaconda3/envs/smarts/lib/python3.8/site-packages/smarts/core/smarts.py", line 317, in _step
provider_state = self._step_providers(all_agent_actions)
File "/home/edward/anaconda3/envs/smarts/lib/python3.8/site-packages/smarts/core/smarts.py", line 1334, in _step_providers
provider_state = provider.step(
File "/home/edward/anaconda3/envs/smarts/lib/python3.8/site-packages/smarts/core/sumo_traffic_simulation.py", line 471, in step
self._last_provider_state = self._step(dt)
File "/home/edward/anaconda3/envs/smarts/lib/python3.8/site-packages/smarts/core/sumo_traffic_simulation.py", line 482, in _step
self._traci_conn.simulationStep(self._cumulative_sim_seconds)
File "/home/edward/anaconda3/envs/smarts/lib/python3.8/site-packages/smarts/core/utils/sumo.py", line 236, in _wrap_traci_method
return method(*args, **kwargs)
File "/home/edward/anaconda3/envs/smarts/lib/python3.8/site-packages/sumo/tools/traci/connection.py", line 366, in simulationStep
result = self._sendCmd(tc.CMD_SIMSTEP, None, None, "D", step)
File "/home/edward/anaconda3/envs/smarts/lib/python3.8/site-packages/sumo/tools/traci/connection.py", line 228, in _sendCmd
return self._sendExact()
File "/home/edward/anaconda3/envs/smarts/lib/python3.8/site-packages/sumo/tools/traci/connection.py", line 131, in _sendExact
result = self._recvExact()
File "/home/edward/anaconda3/envs/smarts/lib/python3.8/site-packages/sumo/tools/traci/connection.py", line 109, in _recvExact
t = self._socket.recv(4 - len(result))
KeyboardInterrupt
Impact (If known)
This bug will hinder training large models with SMARTS.
High Level Description
I am using SMARTS 1.2.0 to train a model. After training for ~12 hours, SMARTS always freezes. I talked to the SMARTS team, and they believe it is related to SumoTrafficSimulation._cumulative_sim_seconds. This variable is probably not reset and grows very large over time. Below is the error message when I terminate the training:
^C^CTraceback (most recent call last):
File "/home/edward/anaconda3/envs/smarts/lib/python3.8/site-packages/smarts/core/smarts.py", line 260, in step
return self._step(agent_actions, time_delta_since_last_step)
File "/home/edward/anaconda3/envs/smarts/lib/python3.8/site-packages/smarts/core/smarts.py", line 317, in _step
provider_state = self._step_providers(all_agent_actions)
File "/home/edward/anaconda3/envs/smarts/lib/python3.8/site-packages/smarts/core/smarts.py", line 1334, in _step_providers
provider_state = provider.step(
File "/home/edward/anaconda3/envs/smarts/lib/python3.8/site-packages/smarts/core/sumo_traffic_simulation.py", line 471, in step
self._last_provider_state = self._step(dt)
File "/home/edward/anaconda3/envs/smarts/lib/python3.8/site-packages/smarts/core/sumo_traffic_simulation.py", line 482, in _step
self._traci_conn.simulationStep(self._cumulative_sim_seconds)
File "/home/edward/anaconda3/envs/smarts/lib/python3.8/site-packages/smarts/core/utils/sumo.py", line 236, in _wrap_traci_method
return method(*args, **kwargs)
File "/home/edward/anaconda3/envs/smarts/lib/python3.8/site-packages/sumo/tools/traci/connection.py", line 366, in simulationStep
result = self._sendCmd(tc.CMD_SIMSTEP, None, None, "D", step)
File "/home/edward/anaconda3/envs/smarts/lib/python3.8/site-packages/sumo/tools/traci/connection.py", line 228, in _sendCmd
return self._sendExact()
File "/home/edward/anaconda3/envs/smarts/lib/python3.8/site-packages/sumo/tools/traci/connection.py", line 131, in _sendExact
result = self._recvExact()
File "/home/edward/anaconda3/envs/smarts/lib/python3.8/site-packages/sumo/tools/traci/connection.py", line 109, in _recvExact
t = self._socket.recv(4 - len(result))
KeyboardInterrupt
Version
I used v1.2.0
Steps to reproduce the bug
System info
System info: Ubuntu 20.04 Python 3.8
Date: 2023-09-26
Error logs and screenshots
^C^CTraceback (most recent call last):
File "/home/edward/anaconda3/envs/smarts/lib/python3.8/site-packages/smarts/core/smarts.py", line 260, in step
return self._step(agent_actions, time_delta_since_last_step)
File "/home/edward/anaconda3/envs/smarts/lib/python3.8/site-packages/smarts/core/smarts.py", line 317, in _step
provider_state = self._step_providers(all_agent_actions)
File "/home/edward/anaconda3/envs/smarts/lib/python3.8/site-packages/smarts/core/smarts.py", line 1334, in _step_providers
provider_state = provider.step(
File "/home/edward/anaconda3/envs/smarts/lib/python3.8/site-packages/smarts/core/sumo_traffic_simulation.py", line 471, in step
self._last_provider_state = self._step(dt)
File "/home/edward/anaconda3/envs/smarts/lib/python3.8/site-packages/smarts/core/sumo_traffic_simulation.py", line 482, in _step
self._traci_conn.simulationStep(self._cumulative_sim_seconds)
File "/home/edward/anaconda3/envs/smarts/lib/python3.8/site-packages/smarts/core/utils/sumo.py", line 236, in _wrap_traci_method
return method(*args, **kwargs)
File "/home/edward/anaconda3/envs/smarts/lib/python3.8/site-packages/sumo/tools/traci/connection.py", line 366, in simulationStep
result = self._sendCmd(tc.CMD_SIMSTEP, None, None, "D", step)
File "/home/edward/anaconda3/envs/smarts/lib/python3.8/site-packages/sumo/tools/traci/connection.py", line 228, in _sendCmd
return self._sendExact()
File "/home/edward/anaconda3/envs/smarts/lib/python3.8/site-packages/sumo/tools/traci/connection.py", line 131, in _sendExact
result = self._recvExact()
File "/home/edward/anaconda3/envs/smarts/lib/python3.8/site-packages/sumo/tools/traci/connection.py", line 109, in _recvExact
t = self._socket.recv(4 - len(result))
KeyboardInterrupt
Impact (If known)
This bug will hinder training large models with SMARTS.