simpler-env / SimplerEnv

Evaluating and reproducing real-world robot manipulation policies (e.g., RT-1, RT-1-X, Octo) in simulation under common setups (e.g., Google Robot, WidowX+Bridge)
https://simpler-env.github.io/
MIT License
311 stars 41 forks source link

Error when evaluation in parallel environments #36

Closed MasterXiong closed 1 month ago

MasterXiong commented 2 months ago

Hi,

I'm trying to evaluate an Octo-based policy in several SimplerEnv environments in parallel to speed up evaluation. I use Python's built-in multiprocessing module. A minimal code example is below:

import numpy as np
from multiprocessing import Process, Pipe
import simpler_env

def worker(remote, parent_remote, env_name):
    """Run one environment in a child process and serve commands over a pipe.

    The loop reads ``(cmd, data)`` tuples from ``remote`` and answers them:

    * ``('step', action)``  -> sends ``(obs, reward, success, truncated, info)``
    * ``('reset', _)``      -> sends ``(obs, reset_info)``
    * ``('close', _)``      -> closes the environment and the pipe, then exits

    Args:
        remote: This worker's end of the pipe.
        parent_remote: The parent's end of the pipe; it is closed here
            because the worker never uses it.
        env_name: Name passed to ``simpler_env.make``.

    Raises:
        NotImplementedError: If an unrecognised command is received. The pipe
            is closed on the way out, so a parent blocked in ``recv()`` gets
            ``EOFError`` instead of waiting forever.
    """
    parent_remote.close()  # Close the parent end of the pipe
    env = simpler_env.make(env_name)
    try:
        while True:
            try:
                cmd, data = remote.recv()
            except EOFError:
                # The parent end was closed without a 'close' command
                # (e.g. the parent crashed); shut down quietly.
                break
            if cmd == 'step':
                obs, reward, success, truncated, info = env.step(data)
                remote.send((obs, reward, success, truncated, info))
            elif cmd == 'reset':
                obs, reset_info = env.reset()
                remote.send((obs, reset_info))
            elif cmd == 'close':
                break
            else:
                raise NotImplementedError(f"Unknown command: {cmd!r}")
    finally:
        # Always release the environment and the pipe, including on errors.
        env.close()
        remote.close()

class ParallelEnvs:
    """Run several copies of one environment, each in its own process.

    Every environment lives in a child process started with ``worker`` and is
    driven through a dedicated ``multiprocessing.Pipe``. All batch methods
    return per-field tuples with one entry per environment, in the order the
    environments were created.
    """

    def __init__(self, env_name, num_envs):
        """Start ``num_envs`` worker processes for ``env_name``.

        Args:
            env_name: Environment name forwarded to each worker.
            num_envs: Number of worker processes (and environments) to start.
        """
        self.num_envs = num_envs
        # Pipe() returns two connected ends; keep the first of each pair in
        # this process and hand the second to the corresponding worker.
        self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(num_envs)])
        self.processes = [Process(target=worker, args=(work_remote, remote, env_name))
                          for (work_remote, remote) in zip(self.work_remotes, self.remotes)]
        for p in self.processes:
            p.start()
        for work_remote in self.work_remotes:
            work_remote.close()  # Close the worker end in the main process

    def step(self, actions):
        """Step every environment with its own action.

        Args:
            actions: One action per environment, in environment order.

        Returns:
            ``(obs, reward, success, truncated, info)``, each a tuple with one
            entry per environment.
        """
        # Send all commands first so the workers step concurrently.
        for remote, action in zip(self.remotes, actions):
            remote.send(('step', action))
        results = [remote.recv() for remote in self.remotes]
        obs, reward, success, truncated, info = zip(*results)
        return obs, reward, success, truncated, info

    def reset(self):
        """Reset every environment.

        Returns:
            ``(obs, reset_info)``, each a tuple with one entry per environment.
        """
        for remote in self.remotes:
            remote.send(('reset', None))
        results = [remote.recv() for remote in self.remotes]
        # Each worker answers with an (obs, reset_info) pair; transpose the
        # list of pairs into a pair of tuples, mirroring step().
        obs, reset_info = zip(*results)
        return obs, reset_info

    def close(self):
        """Ask every worker to shut down, wait for it, and close the pipes."""
        for remote in self.remotes:
            try:
                remote.send(('close', None))
            except OSError:
                # The worker already exited or this end is already closed;
                # there is nobody left to notify.
                pass
        for p in self.processes:
            p.join()
        for remote in self.remotes:
            remote.close()

if __name__ == "__main__":
    # Smoke test: drive several environments in parallel with random actions.

    env_name = "google_robot_pick_coke_can"
    num_envs = 4  # Number of parallel environments
    num_steps = 100
    # Local environment used only to sample random actions from its action space.
    # NOTE(review): this environment is created in the parent before the
    # worker processes are started. The reported ErrorDeviceLost may be
    # related to starting workers after the parent has initialised rendering;
    # unconfirmed - consider the "spawn" start method or creating this
    # environment after the workers.
    dummy_env = simpler_env.make(env_name)

    # Initialize parallel environments
    envs = ParallelEnvs(env_name, num_envs)
    try:
        # Reset all environments
        obs, reset_info = envs.reset()
        breakpoint()  # Inspect the initial observations

        for step in range(num_steps):
            # Get actions from the policy
            actions = [dummy_env.action_space.sample() for _ in range(num_envs)]

            # Step all environments with the actions
            obs, reward, success, truncated, info = envs.step(actions)
            breakpoint()  # Inspect the step results
    finally:
        # Always stop the workers, even if a reset/step failed; otherwise the
        # non-daemon worker processes keep the interpreter from exiting.
        envs.close()
        dummy_env.close()

But I got the following error when initializing multiple environments:

  File "/user/fine-tune/test.py", line 15, in worker
    env = simpler_env.make(env_name)
  File "/SimplerEnv/simpler_env/__init__.py", line 78, in make
    env = gym.make(env_name, obs_mode="rgbd", **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/gymnasium/envs/registration.py", line 802, in make
    env = env_creator(**env_spec_kwargs)
  File "/SimplerEnv/ManiSkill2_real2sim/mani_skill2_real2sim/utils/registration.py", line 92, in make
    env = env_spec.make(**kwargs)
  File "/SimplerEnv/ManiSkill2_real2sim/mani_skill2_real2sim/utils/registration.py", line 34, in make
    return self.cls(**_kwargs)
  File "/SimplerEnv/ManiSkill2_real2sim/mani_skill2_real2sim/envs/custom_scenes/grasp_single_in_scene.py", line 630, in __init__
    super().__init__(**kwargs)
  File "/SimplerEnv/ManiSkill2_real2sim/mani_skill2_real2sim/envs/custom_scenes/grasp_single_in_scene.py", line 540, in __init__
    super().__init__(**kwargs)
  File "/SimplerEnv/ManiSkill2_real2sim/mani_skill2_real2sim/envs/custom_scenes/grasp_single_in_scene.py", line 64, in __init__
    super().__init__(**kwargs)
  File "/SimplerEnv/ManiSkill2_real2sim/mani_skill2_real2sim/envs/custom_scenes/base_env.py", line 134, in __init__
    super().__init__(**kwargs)
  File "/SimplerEnv/ManiSkill2_real2sim/mani_skill2_real2sim/envs/sapien_env.py", line 188, in __init__
    obs, _ = self.reset(seed=2022, options=dict(reconfigure=True))
  File "/SimplerEnv/ManiSkill2_real2sim/mani_skill2_real2sim/envs/custom_scenes/grasp_single_in_scene.py", line 585, in reset
    obs, info = super().reset(seed=self._episode_seed, options=options)
  File "/SimplerEnv/ManiSkill2_real2sim/mani_skill2_real2sim/envs/custom_scenes/grasp_single_in_scene.py", line 135, in reset
    obs, info = super().reset(seed=self._episode_seed, options=options)
  File "/SimplerEnv/ManiSkill2_real2sim/mani_skill2_real2sim/envs/custom_scenes/base_env.py", line 228, in reset
    obs, info = super().reset(seed=seed, options=options)
  File "/SimplerEnv/ManiSkill2_real2sim/mani_skill2_real2sim/envs/sapien_env.py", line 488, in reset
    return self.get_obs(), {}
  File "/SimplerEnv/ManiSkill2_real2sim/mani_skill2_real2sim/envs/custom_scenes/base_env.py", line 350, in get_obs
    obs = super().get_obs()
  File "/SimplerEnv/ManiSkill2_real2sim/mani_skill2_real2sim/envs/sapien_env.py", line 265, in get_obs
    return self._get_obs_images()
  File "/SimplerEnv/ManiSkill2_real2sim/mani_skill2_real2sim/envs/sapien_env.py", line 312, in _get_obs_images
    self.take_picture()
  File "/SimplerEnv/ManiSkill2_real2sim/mani_skill2_real2sim/envs/sapien_env.py", line 289, in take_picture
    cam.take_picture()
  File "/SimplerEnv/ManiSkill2_real2sim/mani_skill2_real2sim/sensors/camera.py", line 187, in take_picture
    self.camera.take_picture()
RuntimeError: vk::Device::waitForFences: ErrorDeviceLost

Could you help take a look at what the issue is here? Or what is the right way to parallelize SimplerEnv environments? Thanks for your help!

P.S. This link may be relevant to my issue here.

StoneT2000 commented 1 month ago

You can follow the progress here: #38

main blocker is making octo accept batched inputs atm

MasterXiong commented 1 month ago

@StoneT2000 Thanks for your help! The GPU simulation currently only supports WidowX tasks, right?

StoneT2000 commented 1 month ago

Yes only the widowx robot. We don't have an implementation of a GPU parallelized version of the google robot's controller. I know it's possible but currently don't have time to tackle that problem just yet.