When launching several jobs in parallel, each with multiple parallel envs, I often get the error below. I suspect it is related to CPU memory pressure from the large number of subprocesses, but I'm not sure; the env subprocesses might simply be getting killed by the server for exceeding its memory limit.
```
E0614 02:06:00.888332 140280418162432 grid_search.py:297] Error sending result: '<multiprocess.pool.ExceptionWithTraceback object at 0x7f956acb0a90>'. Reason: 'PicklingError("Can't pickle <class 'EOFError'>: it's not the same object as builtins.EOFError")'
I0614 02:06:01.364018 140283127826240 grid_search.py:349] Traceback (most recent call last):
  File "/running_root/platform/tensorboard/sfeditor_car-20210614-020253/alf/alf/bin/grid_search.py", line 345, in _worker
    train_eval(root_dir)
  File "/usr/local/lib/python3.7/dist-packages/gin/config.py", line 1137, in gin_wrapper
    utils.augment_exception_message_and_reraise(e, err_str)
  File "/usr/local/lib/python3.7/dist-packages/gin/utils.py", line 49, in augment_exception_message_and_reraise
    six.raise_from(proxy.with_traceback(exception.__traceback__), None)
  File "<string>", line 3, in raise_from
  File "/usr/local/lib/python3.7/dist-packages/gin/config.py", line 1114, in gin_wrapper
    return fn(*new_args, **new_kwargs)
  File "/running_root/platform/tensorboard/sfeditor_car-20210614-020253/alf/alf/bin/train.py", line 85, in train_eval
    trainer = policy_trainer.RLTrainer(trainer_conf)
  File "/running_root/platform/tensorboard/sfeditor_car-20210614-020253/alf/alf/trainers/policy_trainer.py", line 317, in __init__
    debug_summaries=self._debug_summaries)
  File "/usr/local/lib/python3.7/dist-packages/gin/config.py", line 1137, in gin_wrapper
    utils.augment_exception_message_and_reraise(e, err_str)
  File "/usr/local/lib/python3.7/dist-packages/gin/utils.py", line 49, in augment_exception_message_and_reraise
    six.raise_from(proxy.with_traceback(exception.__traceback__), None)
  File "<string>", line 3, in raise_from
  File "/usr/local/lib/python3.7/dist-packages/gin/config.py", line 1114, in gin_wrapper
    return fn(*new_args, **new_kwargs)
  File "/running_root/platform/tensorboard/sfeditor_car-20210614-020253/alf/alf/config_util.py", line 529, in _wrapper
    **unspecified_kw_args)
  File "/running_root/platform/tensorboard/sfeditor_car-20210614-020253/alf/alf/algorithms/agent.py", line 185, in __init__
    **agent_helper.state_specs())
  File "/usr/local/lib/python3.7/dist-packages/gin/config.py", line 1137, in gin_wrapper
    utils.augment_exception_message_and_reraise(e, err_str)
  File "/usr/local/lib/python3.7/dist-packages/gin/utils.py", line 49, in augment_exception_message_and_reraise
    six.raise_from(proxy.with_traceback(exception.__traceback__), None)
  File "<string>", line 3, in raise_from
  File "/usr/local/lib/python3.7/dist-packages/gin/config.py", line 1114, in gin_wrapper
    return fn(*new_args, **new_kwargs)
  File "/running_root/platform/tensorboard/sfeditor_car-20210614-020253/alf/alf/algorithms/rl_algorithm.py", line 184, in __init__
    example_env_info=env.reset().env_info,
  File "/running_root/platform/tensorboard/sfeditor_car-20210614-020253/alf/alf/environments/alf_environment.py", line 182, in reset
    self._current_time_step = self._reset()
  File "/running_root/platform/tensorboard/sfeditor_car-20210614-020253/alf/alf/environments/parallel_environment.py", line 142, in _reset
    time_steps = [promise() for promise in time_steps]
  File "/running_root/platform/tensorboard/sfeditor_car-20210614-020253/alf/alf/environments/parallel_environment.py", line 142, in <listcomp>
    time_steps = [promise() for promise in time_steps]
  File "/running_root/platform/tensorboard/sfeditor_car-20210614-020253/alf/alf/environments/process_environment.py", line 222, in _receive
    message, payload = self._conn.recv()
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 250, in recv
    buf = self._recv_bytes()
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes
    buf = self._recv(4)
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 383, in _recv
    raise EOFError
EOFError:
  In call to configurable 'RLAlgorithm' (<class 'alf.algorithms.rl_algorithm.RLAlgorithm'>)
  In call to configurable 'Agent' (<class 'alf.algorithms.agent.Agent'>)
  In call to configurable 'train_eval' (<function train_eval at 0x7f956fd23680>)
```
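For context, an `EOFError` from `conn.recv()` inside `_receive` is exactly what the parent process sees when an env subprocess dies before replying: the pipe hits end-of-file while `recv()` is reading the 4-byte length header. Here is a minimal sketch (plain `multiprocessing`, not ALF code; function and variable names are illustrative, and it assumes Linux with the fork start method) that reproduces this by simulating an OOM kill with SIGKILL:

```python
# Minimal repro sketch: a child killed with SIGKILL, as the OOM killer would
# do, closes its end of the pipe without sending anything, so the parent's
# recv() raises EOFError just like the traceback above.
import multiprocessing as mp
import os
import signal

def worker(conn):
    os.kill(os.getpid(), signal.SIGKILL)  # simulate the OOM killer
    conn.send('never reached')

if __name__ == '__main__':
    parent_conn, child_conn = mp.Pipe()
    p = mp.Process(target=worker, args=(child_conn,))
    p.start()
    child_conn.close()  # drop the parent's copy so EOF becomes observable
    p.join()
    parent_conn.recv()  # -> EOFError
```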
Sometimes the error message becomes:
File "/running_root/platform/tensorboard/sac_safety_gym_car-20210613-142809/alf/alf/bin/grid_search.py", line 345, in _worker
train_eval(root_dir)
File "/usr/local/lib/python3.7/dist-packages/gin/config.py", line 1137, in gin_wrapper
utils.augment_exception_message_and_reraise(e, err_str)
File "/usr/local/lib/python3.7/dist-packages/gin/utils.py", line 49, in augment_exception_message_and_reraise
six.raise_from(proxy.with_traceback(exception.__traceback__), None)
File "<string>", line 3, in raise_from
File "/usr/local/lib/python3.7/dist-packages/gin/config.py", line 1114, in gin_wrapper
return fn(*new_args, **new_kwargs)
File "/running_root/platform/tensorboard/sac_safety_gym_car-20210613-142809/alf/alf/bin/train.py", line 91, in train_eval
trainer.train()
File "/running_root/platform/tensorboard/sac_safety_gym_car-20210613-142809/alf/alf/trainers/policy_trainer.py", line 153, in train
summary_max_queue=self._summary_max_queue)
File "/running_root/platform/tensorboard/sac_safety_gym_car-20210613-142809/alf/alf/utils/common.py", line 271, in run_under_record_context
func()
File "/running_root/platform/tensorboard/sac_safety_gym_car-20210613-142809/alf/alf/trainers/policy_trainer.py", line 398, in _train
train_steps = self._algorithm.train_iter()
File "/running_root/platform/tensorboard/sac_safety_gym_car-20210613-142809/alf/alf/algorithms/rl_algorithm.py", line 485, in train_iter
return self._train_iter_off_policy()
File "/running_root/platform/tensorboard/sac_safety_gym_car-20210613-142809/alf/alf/algorithms/rl_algorithm.py", line 515, in _train_iter_off_policy
experience = self.unroll(config.unroll_length)
File "/running_root/platform/tensorboard/sac_safety_gym_car-20210613-142809/alf/alf/utils/common.py", line 965, in _func
ret = func(*args, **kwargs)
File "/running_root/platform/tensorboard/sac_safety_gym_car-20210613-142809/alf/alf/algorithms/rl_algorithm.py", line 419, in unroll
next_time_step = self._env.step(action)
File "/running_root/platform/tensorboard/sac_safety_gym_car-20210613-142809/alf/alf/environments/alf_environment.py", line 209, in step
self._current_time_step = self._step(action)
File "/running_root/platform/tensorboard/sac_safety_gym_car-20210613-142809/alf/alf/environments/parallel_environment.py", line 159, in _step
for env, action in zip(self._envs, self._unstack_actions(actions))
File "/running_root/platform/tensorboard/sac_safety_gym_car-20210613-142809/alf/alf/environments/parallel_environment.py", line 159, in <listcomp>
for env, action in zip(self._envs, self._unstack_actions(actions))
File "/running_root/platform/tensorboard/sac_safety_gym_car-20210613-142809/alf/alf/environments/process_environment.py", line 190, in step
promise = self.call('step', action)
File "/running_root/platform/tensorboard/sac_safety_gym_car-20210613-142809/alf/alf/environments/process_environment.py", line 167, in call
self._conn.send((self._CALL, payload))
File "/usr/lib/python3.7/multiprocessing/connection.py", line 206, in send
self._send_bytes(_ForkingPickler.dumps(obj))
File "/usr/lib/python3.7/multiprocessing/connection.py", line 404, in _send_bytes
self._send(header + buf)
File "/usr/lib/python3.7/multiprocessing/connection.py", line 368, in _send
n = write(self._handle, buf)
BrokenPipeError: [Errno 32] Broken pipe
In call to configurable 'train_eval' (<function train_eval at 0x7f5542a9d9e0>)
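This second form looks like the same failure seen from the other direction: the env subprocess is already gone when the parent tries to `send()` the next 'step' command, so the write gets EPIPE. A minimal sketch (again plain `multiprocessing`, not ALF code; names are illustrative, Unix assumed):

```python
# Minimal repro sketch: once the child has exited and no reader remains on
# the other end of the pipe, send() raises BrokenPipeError ([Errno 32]),
# matching the traceback above.
import multiprocessing as mp

def worker(conn):
    conn.close()  # child exits without ever serving a 'step' request

if __name__ == '__main__':
    parent_conn, child_conn = mp.Pipe()
    p = mp.Process(target=worker, args=(child_conn,))
    p.start()
    child_conn.close()  # drop the parent's copy of the child's end
    p.join()
    parent_conn.send(('step', None))  # -> BrokenPipeError: [Errno 32]
```

If the memory hypothesis is right, it should be verifiable: the dead env processes would have `exitcode == -9` (multiprocessing reports death by signal N as -N, and SIGKILL is what the Linux OOM killer sends), and the kernel log (`dmesg`) on the server should contain oom-killer entries naming them.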