Closed taylorlulu closed 1 year ago
I ran into the same problem when I tried to run the Hierarchical RL Starter Code; it also appears to be caused by an EOFError. The error message is shown below:
2023-09-19 17:23:13,264 agent number of parameters: 8441617
Traceback (most recent call last):
File "habitat_baselines/run.py", line 81, in <module>
main()
File "habitat_baselines/run.py", line 40, in main
run_exp(**vars(args))
File "habitat_baselines/run.py", line 77, in run_exp
execute_exp(config, run_type)
File "habitat_baselines/run.py", line 60, in execute_exp
trainer.train()
File "/home/lu/.conda/envs/habitat/lib/python3.7/contextlib.py", line 74, in inner
return func(*args, **kwds)
File "/home/lu/Desktop/embodied_ai/hab-mobile-manipulation/habitat-lab/habitat_baselines/rl/ppo/ppo_trainer.py", line 715, in train
self._init_train()
File "/home/lu/Desktop/embodied_ai/hab-mobile-manipulation/habitat-lab/habitat_baselines/rl/ppo/ppo_trainer.py", line 318, in _init_train
observations = self.envs.reset()
File "/home/lu/Desktop/embodied_ai/hab-mobile-manipulation/habitat-lab/habitat/core/vector_env.py", line 384, in reset
results.append(read_fn())
File "/home/lu/Desktop/embodied_ai/hab-mobile-manipulation/habitat-lab/habitat/core/vector_env.py", line 103, in __call__
res = self.read_fn()
File "/home/lu/Desktop/embodied_ai/hab-mobile-manipulation/habitat-lab/habitat/utils/pickle5_multiprocessing.py", line 68, in recv
buf = self.recv_bytes()
File "/home/lu/.conda/envs/habitat/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes
buf = self._recv_bytes(maxlength)
File "/home/lu/.conda/envs/habitat/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes
buf = self._recv(4)
File "/home/lu/.conda/envs/habitat/lib/python3.7/multiprocessing/connection.py", line 383, in _recv
raise EOFError
EOFError
Exception ignored in: <function VectorEnv.__del__ at 0x7f428cea0ef0>
Traceback (most recent call last):
File "/home/lu/Desktop/embodied_ai/hab-mobile-manipulation/habitat-lab/habitat/core/vector_env.py", line 584, in __del__
self.close()
File "/home/lu/Desktop/embodied_ai/hab-mobile-manipulation/habitat-lab/habitat/core/vector_env.py", line 452, in close
read_fn()
File "/home/lu/Desktop/embodied_ai/hab-mobile-manipulation/habitat-lab/habitat/core/vector_env.py", line 103, in __call__
res = self.read_fn()
File "/home/lu/Desktop/embodied_ai/hab-mobile-manipulation/habitat-lab/habitat/utils/pickle5_multiprocessing.py", line 68, in recv
buf = self.recv_bytes()
File "/home/lu/.conda/envs/habitat/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes
buf = self._recv_bytes(maxlength)
File "/home/lu/.conda/envs/habitat/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes
buf = self._recv(4)
File "/home/lu/.conda/envs/habitat/lib/python3.7/multiprocessing/connection.py", line 383, in _recv
raise EOFError
EOFError:
And I solved the problem by referring to this issue https://github.com/facebookresearch/habitat-lab/issues/1149.
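I changed the `VectorEnv(...)` construction in `construct_envs` (`habitat_baselines/common/construct_vector_env.py`) into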
envs = ThreadedVectorEnv(
make_env_fn=make_gym_from_config,
env_fn_args=tuple((c,) for c in configs),
workers_ignore_signals=workers_ignore_signals,
)
I don't know what caused the problem or how to keep using VectorEnv instead, but the problem can be worked around this way.
I'm using Habitat-Sim:0.2.2 Habitat: 0.2.2 Ubuntu: 20.04.
When I run
```
export MAGNUM_LOG=quiet
export HABITAT_SIM_LOG=quiet
set -x
python habitat_baselines/run.py \
    --exp-config ../habitat-challenge/configs/methods/ddppo_monolithic.yaml \
    --run-type train \
    BASE_TASK_CONFIG_PATH ../habitat-challenge/configs/tasks/rearrange.local.rgbd.yaml \
    TASK_CONFIG.DATASET.SPLIT 'train' \
    TASK_CONFIG.TASK.TASK_SPEC_BASE_PATH ../habitat-challenge/configs/pddl/ \
    TENSORBOARD_DIR ./tb \
    CHECKPOINT_FOLDER ./checkpoints \
    LOG_FILE ./train.log
```
it reported the following error:
Traceback (most recent call last):
File "habitat-lab/habitat_baselines/run.py", line 81, in <module>
main()
File "habitat-lab/habitat_baselines/run.py", line 40, in main
run_exp(**vars(args))
File "habitat-lab/habitat_baselines/run.py", line 77, in run_exp
execute_exp(config, run_type)
File "habitat-lab/habitat_baselines/run.py", line 60, in execute_exp
trainer.train()
File "/home/lu/.conda/envs/habitat/lib/python3.7/contextlib.py", line 74, in inner
return func(*args, **kwds)
File "/home/lu/Desktop/embodied_ai/hab-mobile-manipulation/habitat-lab/habitat_baselines/rl/ppo/ppo_trainer.py", line 715, in train
self._init_train()
File "/home/lu/Desktop/embodied_ai/hab-mobile-manipulation/habitat-lab/habitat_baselines/rl/ppo/ppo_trainer.py", line 254, in _init_train
self._init_envs()
File "/home/lu/Desktop/embodied_ai/hab-mobile-manipulation/habitat-lab/habitat_baselines/rl/ppo/ppo_trainer.py", line 204, in _init_envs
workers_ignore_signals=is_slurm_batch_job(),
File "/home/lu/Desktop/embodied_ai/hab-mobile-manipulation/habitat-lab/habitat_baselines/common/construct_vector_env.py", line 97, in construct_envs
workers_ignore_signals=workers_ignore_signals,
File "/home/lu/Desktop/embodied_ai/hab-mobile-manipulation/habitat-lab/habitat/core/vector_env.py", line 200, in __init__
read_fn() for read_fn in self._connection_read_fns
File "/home/lu/Desktop/embodied_ai/hab-mobile-manipulation/habitat-lab/habitat/core/vector_env.py", line 200, in <listcomp>
read_fn() for read_fn in self._connection_read_fns
File "/home/lu/Desktop/embodied_ai/hab-mobile-manipulation/habitat-lab/habitat/core/vector_env.py", line 103, in __call__
res = self.read_fn()
File "/home/lu/Desktop/embodied_ai/hab-mobile-manipulation/habitat-lab/habitat/utils/pickle5_multiprocessing.py", line 68, in recv
buf = self.recv_bytes()
File "/home/lu/.conda/envs/habitat/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes
buf = self._recv_bytes(maxlength)
File "/home/lu/.conda/envs/habitat/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes
buf = self._recv(4)
File "/home/lu/.conda/envs/habitat/lib/python3.7/multiprocessing/connection.py", line 379, in _recv
chunk = read(handle, remaining)
ConnectionResetError: [Errno 104] Connection reset by peer
Exception ignored in: <function VectorEnv.__del__ at 0x7fafedb180e0>
Traceback (most recent call last):
File "/home/lu/Desktop/embodied_ai/hab-mobile-manipulation/habitat-lab/habitat/core/vector_env.py", line 584, in __del__
self.close()
File "/home/lu/Desktop/embodied_ai/hab-mobile-manipulation/habitat-lab/habitat/core/vector_env.py", line 452, in close
read_fn()
File "/home/lu/Desktop/embodied_ai/hab-mobile-manipulation/habitat-lab/habitat/core/vector_env.py", line 103, in __call__
res = self.read_fn()
File "/home/lu/Desktop/embodied_ai/hab-mobile-manipulation/habitat-lab/habitat/utils/pickle5_multiprocessing.py", line 68, in recv
buf = self.recv_bytes()
File "/home/lu/.conda/envs/habitat/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes
buf = self._recv_bytes(maxlength)
File "/home/lu/.conda/envs/habitat/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes
buf = self._recv(4)
File "/home/lu/.conda/envs/habitat/lib/python3.7/multiprocessing/connection.py", line 383, in _recv
raise EOFError
EOFError:
I am using a single RTX 3070 GPU. Does this have something to do with my GPU or PyTorch version? I would appreciate it a lot if anyone could help.