run python run_lab.py slm_lab/spec/benchmark/ppo/ppo_pong.json ppo_pong train
Additional context
Only 4 CPU cores are used and they are running at 100%. It seems to use all four of the GPUs (RTX 3090).
Error logs
Replaced the start of the paths with ...
Process Process-5:
Traceback (most recent call last):
File ".../envs/lab/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
self.run()
File ".../envs/lab/lib/python3.7/multiprocessing/process.py", line 99, in run
self._target(*self._args, **self._kwargs)
File ".../SLM-Lab/slm_lab/experiment/control.py", line 26, in mp_run_session
metrics = session.run()
File ".../SLM-Lab/slm_lab/experiment/control.py", line 118, in run
self.run_rl()
File ".../SLM-Lab/slm_lab/experiment/control.py", line 104, in run_rl
action = self.agent.act(state)
File ".../slm_lab/agent/__init__.py", line 43, in act
action = self.algorithm.act(state)
File ".../SLM-Lab/slm_lab/agent/algorithm/reinforce.py", line 102, in act
action = self.action_policy(state, self, body)
File ".../SLM-Lab/slm_lab/agent/algorithm/policy_util.py", line 137, in default
pdparam = calc_pdparam(state, algorithm, body)
File ".../SLM-Lab/slm_lab/agent/algorithm/policy_util.py", line 85, in calc_pdparam
pdparam = algorithm.calc_pdparam(state)
File ".../SLM-Lab/slm_lab/agent/algorithm/actor_critic.py", line 172, in calc_pdparam
out = super().calc_pdparam(x, net=net)
File ".../SLM-Lab/slm_lab/agent/algorithm/reinforce.py", line 96, in calc_pdparam
pdparam = net(x)
File ".../envs/lab/lib/python3.7/site-packages/torch/nn/modules/module.py", line 541, in __call__
result = self.forward(*input, **kwargs)
File ".../SLM-Lab/slm_lab/agent/net/conv.py", line 176, in forward
x = self.conv_model(x)
File ".../envs/lab/lib/python3.7/site-packages/torch/nn/modules/module.py", line 541, in __call__
result = self.forward(*input, **kwargs)
File ".../envs/lab/lib/python3.7/site-packages/torch/nn/modules/container.py", line 92, in forward
input = module(input)
File ".../envs/lab/lib/python3.7/site-packages/torch/nn/modules/module.py", line 541, in __call__
result = self.forward(*input, **kwargs)
File ".../envs/lab/lib/python3.7/site-packages/torch/nn/modules/conv.py", line 345, in forward
return self.conv2d_forward(input, self.weight)
File ".../envs/lab/lib/python3.7/site-packages/torch/nn/modules/conv.py", line 342, in conv2d_forward
self.padding, self.dilation, self.groups)
RuntimeError: cuDNN error: CUDNN_STATUS_EXECUTION_FAILED
Process Process-3:
Traceback (most recent call last):
File ".../envs/lab/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
self.run()
File ".../envs/lab/lib/python3.7/multiprocessing/process.py", line 99, in run
self._target(*self._args, **self._kwargs)
File ".../SLM-Lab/slm_lab/experiment/control.py", line 26, in mp_run_session
metrics = session.run()
File ".../SLM-Lab/slm_lab/experiment/control.py", line 118, in run
self.run_rl()
File ".../SLM-Lab/slm_lab/experiment/control.py", line 104, in run_rl
action = self.agent.act(state)
File ".../SLM-Lab/slm_lab/agent/__init__.py", line 43, in act
action = self.algorithm.act(state)
File ".../SLM-Lab/slm_lab/agent/algorithm/reinforce.py", line 102, in act
action = self.action_policy(state, self, body)
File ".../SLM-Lab/slm_lab/agent/algorithm/policy_util.py", line 137, in default
pdparam = calc_pdparam(state, algorithm, body)
File ".../SLM-Lab/slm_lab/agent/algorithm/policy_util.py", line 85, in calc_pdparam
pdparam = algorithm.calc_pdparam(state)
File ".../SLM-Lab/slm_lab/agent/algorithm/actor_critic.py", line 172, in calc_pdparam
out = super().calc_pdparam(x, net=net)
File ".../SLM-Lab/slm_lab/agent/algorithm/reinforce.py", line 96, in calc_pdparam
pdparam = net(x)
File ".../envs/lab/lib/python3.7/site-packages/torch/nn/modules/module.py", line 541, in __call__
result = self.forward(*input, **kwargs)
File ".../SLM-Lab/slm_lab/agent/net/conv.py", line 176, in forward
x = self.conv_model(x)
File ".../envs/lab/lib/python3.7/site-packages/torch/nn/modules/module.py", line 541, in __call__
result = self.forward(*input, **kwargs)
File ".../envs/lab/lib/python3.7/site-packages/torch/nn/modules/container.py", line 92, in forward
input = module(input)
File ".../envs/lab/lib/python3.7/site-packages/torch/nn/modules/module.py", line 541, in __call__
result = self.forward(*input, **kwargs)
File ".../envs/lab/lib/python3.7/site-packages/torch/nn/modules/conv.py", line 345, in forward
return self.conv2d_forward(input, self.weight)
File ".../envs/lab/lib/python3.7/site-packages/torch/nn/modules/conv.py", line 342, in conv2d_forward
self.padding, self.dilation, self.groups)
RuntimeError: cuDNN error: CUDNN_STATUS_EXECUTION_FAILED
Process Process-4:
Traceback (most recent call last):
File ".../envs/lab/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
self.run()
File ".../envs/lab/lib/python3.7/multiprocessing/process.py", line 99, in run
self._target(*self._args, **self._kwargs)
File ".../SLM-Lab/slm_lab/experiment/control.py", line 26, in mp_run_session
metrics = session.run()
File ".../SLM-Lab/slm_lab/experiment/control.py", line 118, in run
self.run_rl()
File ".../SLM-Lab/slm_lab/experiment/control.py", line 104, in run_rl
action = self.agent.act(state)
File ".../SLM-Lab/slm_lab/agent/__init__.py", line 43, in act
action = self.algorithm.act(state)
File ".../SLM-Lab/slm_lab/agent/algorithm/reinforce.py", line 102, in act
action = self.action_policy(state, self, body)
File ".../SLM-Lab/slm_lab/agent/algorithm/policy_util.py", line 137, in default
pdparam = calc_pdparam(state, algorithm, body)
File ".../SLM-Lab/slm_lab/agent/algorithm/policy_util.py", line 85, in calc_pdparam
pdparam = algorithm.calc_pdparam(state)
File ".../SLM-Lab/slm_lab/agent/algorithm/actor_critic.py", line 172, in calc_pdparam
out = super().calc_pdparam(x, net=net)
File ".../SLM-Lab/slm_lab/agent/algorithm/reinforce.py", line 96, in calc_pdparam
pdparam = net(x)
File ".../envs/lab/lib/python3.7/site-packages/torch/nn/modules/module.py", line 541, in __call__
result = self.forward(*input, **kwargs)
File ".../SLM-Lab/slm_lab/agent/net/conv.py", line 176, in forward
x = self.conv_model(x)
File ".../envs/lab/lib/python3.7/site-packages/torch/nn/modules/module.py", line 541, in __call__
result = self.forward(*input, **kwargs)
File ".../envs/lab/lib/python3.7/site-packages/torch/nn/modules/container.py", line 92, in forward
input = module(input)
File ".../envs/lab/lib/python3.7/site-packages/torch/nn/modules/module.py", line 541, in __call__
result = self.forward(*input, **kwargs)
File ".../envs/lab/lib/python3.7/site-packages/torch/nn/modules/conv.py", line 345, in forward
return self.conv2d_forward(input, self.weight)
File ".../envs/lab/lib/python3.7/site-packages/torch/nn/modules/conv.py", line 342, in conv2d_forward
self.padding, self.dilation, self.groups)
RuntimeError: cuDNN error: CUDNN_STATUS_EXECUTION_FAILED
Traceback (most recent call last):
File "run_lab.py", line 99, in <module>
main()
File "run_lab.py", line 91, in main
get_spec_and_run(*args)
File "run_lab.py", line 75, in get_spec_and_run
run_spec(spec, lab_mode)
File "run_lab.py", line 58, in run_spec
Trial(spec).run()
File ".../SLM-Lab/slm_lab/experiment/control.py", line 182, in run
metrics = analysis.analyze_trial(self.spec, session_metrics_list)
File ".../SLM-Lab/slm_lab/experiment/analysis.py", line 266, in analyze_trial
trial_metrics = calc_trial_metrics(session_metrics_list, info_prepath)
File ".../SLM-Lab/slm_lab/experiment/analysis.py", line 186, in calc_trial_metrics
frames = session_metrics_list[0]['local']['frames']
IndexError: list index out of range
Describe the bug Running the PPO on Pong example from the guides results in CUDNN_STATUS_EXECUTION_FAILED error.
To Reproduce
Commit hash (run `git rev-parse HEAD` to get it): faca82c00c51a993e1773e115d5528ffb7ad4ade
Spec file used: slm_lab/spec/benchmark/ppo/ppo_pong.json
Command run: python run_lab.py slm_lab/spec/benchmark/ppo/ppo_pong.json ppo_pong train
Additional context: Only 4 CPU cores are used and they are running at 100%. It seems to use all four of the GPUs (RTX 3090).
Error logs: the start of each file path has been replaced with `...` for brevity.
...