openai / baselines

OpenAI Baselines: high-quality implementations of reinforcement learning algorithms

Her.RolloutWorker not finding env.reset method #1083

Closed rickstaa closed 4 years ago

rickstaa commented 4 years ago

I am currently trying to use the HER algorithm to train a Fetch robot. I do this with the train.py script below (see this repository for the full code).

#! /usr/bin/env python
"""
Uses the HER (Hindsight Experience Replay) algorithm. HER uses the experience
replay known from deep Q-learning, but chooses the goals after the episode is
over (in hindsight).
"""
import os
import sys

import click
import numpy as np
import json
from mpi4py import MPI

from baselines import logger
from baselines.common import set_global_seeds
from baselines.common.mpi_moments import mpi_moments
import baselines.her.experiment.config as config
from baselines.her.rollout import RolloutWorker
from baselines.her.util import mpi_fork
from baselines.common.cmd_util import make_vec_env
from baselines.common.cmd_util import make_robotics_env
import gym

from subprocess import CalledProcessError

# from gym.envs.robotics.fetch import reach
import my_fetch_task_env
import rospy

def mpi_average(value):
    if value == []:
        value = [0.0]
    if not isinstance(value, list):
        value = [value]
    return mpi_moments(np.array(value))[0]

def train(
    policy,
    rollout_worker,
    evaluator,
    n_epochs,
    n_test_rollouts,
    n_cycles,
    n_batches,
    policy_save_interval,
    save_policies,
    **kwargs
):
    rank = MPI.COMM_WORLD.Get_rank()

    latest_policy_path = os.path.join(logger.get_dir(), "policy_latest.pkl")
    best_policy_path = os.path.join(logger.get_dir(), "policy_best.pkl")
    periodic_policy_path = os.path.join(logger.get_dir(), "policy_{}.pkl")

    logger.info("Training...")
    best_success_rate = -1
    for epoch in range(n_epochs):
        # train
        rollout_worker.clear_history()
        for _ in range(n_cycles):
            episode = rollout_worker.generate_rollouts()
            policy.store_episode(episode)
            for _ in range(n_batches):
                policy.train()
            policy.update_target_net()

        # test
        evaluator.clear_history()
        for _ in range(n_test_rollouts):
            evaluator.generate_rollouts()

        # record logs
        logger.record_tabular("epoch", epoch)
        for key, val in evaluator.logs("test"):
            logger.record_tabular(key, mpi_average(val))
        for key, val in rollout_worker.logs("train"):
            logger.record_tabular(key, mpi_average(val))
        for key, val in policy.logs():
            logger.record_tabular(key, mpi_average(val))

        if rank == 0:
            logger.dump_tabular()

        # save the policy if it's better than the previous ones
        success_rate = mpi_average(evaluator.current_success_rate())
        if rank == 0 and success_rate >= best_success_rate and save_policies:
            best_success_rate = success_rate
            logger.info(
                "New best success rate: {}. Saving policy to {} ...".format(
                    best_success_rate, best_policy_path
                )
            )
            evaluator.save_policy(best_policy_path)
            evaluator.save_policy(latest_policy_path)
        if (
            rank == 0
            and policy_save_interval > 0
            and epoch % policy_save_interval == 0
            and save_policies
        ):
            policy_path = periodic_policy_path.format(epoch)
            logger.info("Saving periodic policy to {} ...".format(policy_path))
            evaluator.save_policy(policy_path)

        # make sure that different threads have different seeds
        local_uniform = np.random.uniform(size=(1,))
        root_uniform = local_uniform.copy()
        MPI.COMM_WORLD.Bcast(root_uniform, root=0)
        if rank != 0:
            assert local_uniform[0] != root_uniform[0]

def launch(
    env,
    logdir,
    n_epochs,
    num_cpu,
    seed,
    replay_strategy,
    policy_save_interval,
    clip_return,
    override_params={},
    save_policies=True,
):
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        try:
            whoami = mpi_fork(num_cpu, ["--bind-to", "core"])
        except CalledProcessError:
            # fancy version of mpi call failed, try simple version
            whoami = mpi_fork(num_cpu)

        if whoami == "parent":
            sys.exit(0)
        import baselines.common.tf_util as U

        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(dir=logdir)
    else:
        logger.configure()
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params["env_name"] = env
    params["replay_strategy"] = replay_strategy
    if env in config.DEFAULT_ENV_PARAMS:
        params.update(
            config.DEFAULT_ENV_PARAMS[env]
        )  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), "params.json"), "w") as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn("*** Warning ***")
        logger.warn(
            "You are running HER with just a single MPI worker. This will work, but the "
            + "experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) "
            + "were obtained with --num_cpu 19. This makes a significant difference and if you "
            + "are looking to reproduce those results, be aware of this. Please also refer to "
            + "https://github.com/openai/baselines/issues/314 for further details."
        )
        logger.warn("****************")
        logger.warn()

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return)

    rollout_params = {
        "exploit": False,
        "use_target_net": False,
        "use_demo_states": True,
        "compute_Q": False,
        "T": params["T"],
    }

    eval_params = {
        "exploit": True,
        "use_target_net": params["test_with_polyak"],
        "use_demo_states": False,
        "compute_Q": True,
        "T": params["T"],
    }

    for name in ["T", "rollout_batch_size", "gamma", "noise_eps", "random_eps"]:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    # FIXME: VERSION 1: Use params["make_env"] variable to create environment
    env_vec = params["make_env"]

    # FIXME: VERSION 2: Initialize vector environment using make_vect_env class
    # env_vec = make_vec_env(env, 'robotics', num_env=1, seed=None)
    # env_vec = make_vec_env(env, 'robotics', num_env=1, seed=rank_seed)

    # FIXME: try2: Initialize vector environment using the make_robotics_env class
    # env_vec = make_robotics_env(env, seed=rank_seed, rank=rank)

    # FIXME: try3: Initialize vector environment using the original gym.make method
    env_vec = gym.make(env)
    env_vec.seed(rank_seed)

    # Create Rollout worker
    rollout_worker = RolloutWorker(
        env_vec, policy, dims, logger, **rollout_params
    )
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(env_vec, policy, dims, logger, **eval_params)
    evaluator.seed(rank_seed)

    train(
        logdir=logdir,
        policy=policy,
        rollout_worker=rollout_worker,
        evaluator=evaluator,
        n_epochs=n_epochs,
        n_test_rollouts=params["n_test_rollouts"],
        n_cycles=params["n_cycles"],
        n_batches=params["n_batches"],
        policy_save_interval=policy_save_interval,
        save_policies=save_policies,
    )

@click.command()
@click.option(
    "--env",
    type=str,
    default="FetchReach-v0",
    help="the name of the OpenAI Gym environment that you want to train on",
)
@click.option(
    "--logdir",
    type=str,
    default=None,
    help="the path to where logs and policy pickles should go. If not specified, creates a folder in /tmp/",
)
@click.option(
    "--n_epochs", type=int, default=50, help="the number of training epochs to run"
)
@click.option(
    "--num_cpu", type=int, default=1, help="the number of CPU cores to use (using MPI)"
)
@click.option(
    "--seed",
    type=int,
    default=0,
    help="the random seed used to seed both the environment and the training code",
)
@click.option(
    "--policy_save_interval",
    type=int,
    default=5,
    help="the interval with which policy pickles are saved. If set to 0, only the best and latest policy will be pickled.",
)
@click.option(
    "--replay_strategy",
    type=click.Choice(["future", "none"]),
    default="future",
    help='the HER replay strategy to be used. "future" uses HER, "none" disables HER.',
)
@click.option(
    "--clip_return",
    type=int,
    default=1,
    help="whether or not returns should be clipped",
)
def main(**kwargs):
    rospy.init_node("train_fetch_her")
    launch(**kwargs)

if __name__ == "__main__":
    main()

Unfortunately, when running the train.py file I get the following error message:

AttributeError: 'function' object has no attribute 'reset' (in rollout.py)
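
For context, VERSION 1 in the script above assigns params["make_env"] without calling it; as far as I can tell, config.prepare_params stores an environment factory under that key, so the RolloutWorker receives a bare function and the reset call fails. A minimal sketch of the difference, assuming the factory can be called without arguments:

# VERSION 1 hands the factory function itself to the RolloutWorker:
env_vec = params["make_env"]      # a function object, hence no .reset()

# Calling the factory yields an actual environment instance instead
# (the master-branch RolloutWorker may still expect a vectorized env):
env_vec = params["make_env"]()
env_vec.seed(rank_seed)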

In #798, @pzhokhov gave two solutions that can be used to get rid of this error. Neither of them solved the problem for me; my report on each attempt follows below.

Option 1: Revert to the old version (commit 146bbf886ba533fe08b07e01d1c0356aaf7fcc80)

When running the train.py file at that commit, I now run into the following error:

ModuleNotFoundError: No module named 'mujoco_py'

As I would rather modify my train.py script than buy an additional MuJoCo license, I tried the second solution.

Option 2: Instantiate the environment before initializing the RolloutWorker

I therefore tried adding the following code before the RolloutWorker initialization:

from baselines.common.cmd_util import make_vec_env
env = make_vec_env(env, 'robotics', num_env=1, seed=None)

But when running the train_modified.py script, I now receive the following error:

Traceback (most recent call last):
  File "./src/my_fetch_train/scripts/train_modified.py", line 280, in <module>
    main()
  File "/home/ricks/.catkin_ws_python3/openai_venv/lib/python3.6/site-packages/click/core.py", line 764, in __call__
    return self.main(*args, **kwargs)
  File "/home/ricks/.catkin_ws_python3/openai_venv/lib/python3.6/site-packages/click/core.py", line 717, in main
    rv = self.invoke(ctx)
  File "/home/ricks/.catkin_ws_python3/openai_venv/lib/python3.6/site-packages/click/core.py", line 956, in invoke
    return ctx.invoke(self.callback, **ctx.params)
  File "/home/ricks/.catkin_ws_python3/openai_venv/lib/python3.6/site-packages/click/core.py", line 555, in invoke
    return callback(*args, **kwargs)
  File "./src/my_fetch_train/scripts/train_modified.py", line 276, in main
    launch(**kwargs)
  File "./src/my_fetch_train/scripts/train_modified.py", line 210, in launch
    env_vec, policy, dims, logger, **rollout_params
  File "/home/ricks/Development/robot_academy_ws/src/baselines/baselines/her/util.py", line 36, in wrapper
    return method(*positional_args, **keyword_args)
  File "/home/ricks/Development/robot_academy_ws/src/baselines/baselines/her/rollout.py", line 41, in __init__
    self.reset_all_rollouts()
  File "/home/ricks/Development/robot_academy_ws/src/baselines/baselines/her/rollout.py", line 46, in reset_all_rollouts
    self.initial_o = self.obs_dict['observation']
IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices
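
A possible cause, judging from how baselines/run.py builds environments for HER: make_vec_env flattens dict observations by default, so reset_all_rollouts ends up indexing a flat array with the string 'observation'. run.py passes flatten_dict_observations=False when the algorithm is HER, which suggests a sketch along these lines (untested):

# Untested sketch: keep the dict observation space that the HER RolloutWorker
# expects by disabling observation flattening in make_vec_env.
from baselines.common.cmd_util import make_vec_env

env_vec = make_vec_env(env, 'robotics', num_env=1, seed=rank_seed,
                       flatten_dict_observations=False)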

I presume this is caused by the DummyVecEnv wrapper being used instead of my own environment. As a result, I tried the following code:

from baselines.common.cmd_util import make_robotics_env
env_vec = make_robotics_env(env, seed=rank_seed, rank=rank)

But this also gave me the error above. Finally, I tried passing a plain gym environment instead of a vectorized one, using the following code:

    rollout_worker = RolloutWorker(
        env_vec, policy, dims, logger, **rollout_params
    )
    rollout_worker.seed(rank_seed)

But when doing this, I receive the following error:

Traceback (most recent call last):
  File "./src/my_fetch_train/scripts/train_modified.py", line 289, in <module>
    main()
  File "/home/ricks/.catkin_ws_python3/openai_venv/lib/python3.6/site-packages/click/core.py", line 764, in __call__
    return self.main(*args, **kwargs)
  File "/home/ricks/.catkin_ws_python3/openai_venv/lib/python3.6/site-packages/click/core.py", line 717, in main
    rv = self.invoke(ctx)
  File "/home/ricks/.catkin_ws_python3/openai_venv/lib/python3.6/site-packages/click/core.py", line 956, in invoke
    return ctx.invoke(self.callback, **ctx.params)
  File "/home/ricks/.catkin_ws_python3/openai_venv/lib/python3.6/site-packages/click/core.py", line 555, in invoke
    return callback(*args, **kwargs)
  File "./src/my_fetch_train/scripts/train_modified.py", line 285, in main
    launch(**kwargs)
  File "./src/my_fetch_train/scripts/train_modified.py", line 221, in launch
    rollout_worker.seed(rank_seed)
AttributeError: 'RolloutWorker' object has no attribute 'seed'
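
Judging from this traceback, the RolloutWorker on the master branch no longer exposes a seed() method, so a possible workaround (untested sketch) is to rely on seeding the environment itself and drop the worker-level calls:

# Sketch: seed the environment itself and omit the removed RolloutWorker.seed()
env_vec = gym.make(env)
env_vec.seed(rank_seed)

rollout_worker = RolloutWorker(env_vec, policy, dims, logger, **rollout_params)
evaluator = RolloutWorker(env_vec, policy, dims, logger, **eval_params)
# no rollout_worker.seed(...) / evaluator.seed(...) calls here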

I was therefore wondering if somebody has an example of how to use the HER algorithm with the master branch of the openai/baselines repository?

RyanRizzo96 commented 4 years ago

I was therefore wondering if somebody has an example of how to use the HER algorithm with the master branch of the openai/baselines repository?

I use a bash script containing:

python3 -m baselines.run --alg=her --env=FetchReach-v1 --num_timesteps=5000

Example here:

https://github.com/RyanRizzo96/RL_baselines

rickstaa commented 4 years ago

@RyanRizzo96 Thanks a lot for the example; I will take a look at it.

rickstaa commented 4 years ago

My problems were solved by switching to the stable-baselines fork instead.

rickstaa commented 4 years ago

As multiple people have contacted me about how I got it to work with the stable-baselines fork, here is a quick guide:

Example script

Following the original documentation, I will use the following example script:

from stable_baselines import HER, DQN, SAC, DDPG, TD3
from stable_baselines.her import GoalSelectionStrategy, HERGoalEnvWrapper
from stable_baselines.common.bit_flipping_env import BitFlippingEnv

model_class = DQN  # works also with SAC, DDPG and TD3
N_BITS = 2

env = BitFlippingEnv(
    N_BITS, continuous=model_class in [DDPG, SAC, TD3], max_steps=N_BITS
)

# Available strategies (cf paper): future, final, episode, random
goal_selection_strategy = "future"  # equivalent to GoalSelectionStrategy.FUTURE

# Wrap the model
model = HER(
    "MlpPolicy",
    env,
    model_class,
    n_sampled_goal=4,
    goal_selection_strategy=goal_selection_strategy,
    verbose=1,
)
# Train the model
model.learn(1000)

model.save("./her_bit_env")

# WARNING: you must pass an env
# or wrap your environment with HERGoalEnvWrapper to use the predict method
model = HER.load("./her_bit_env", env=env)

obs = env.reset()
for _ in range(100):
    action, _ = model.predict(obs)
    obs, reward, done, _ = env.step(action)

    if done:
        obs = env.reset()
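
For a robot task like the one in the original post, the same pattern should work with any goal-based Gym environment, i.e. one whose observations are dicts with observation, achieved_goal and desired_goal keys. A sketch, where "MyFetchTask-v0" is a hypothetical ID for a custom environment registered elsewhere:

# Sketch: HER wrapping a custom goal-based environment instead of BitFlippingEnv.
# "MyFetchTask-v0" is a placeholder ID; the env must expose a dict observation
# space with observation / achieved_goal / desired_goal entries.
import gym
from stable_baselines import HER, SAC

custom_env = gym.make("MyFetchTask-v0")  # hypothetical env ID
custom_model = HER("MlpPolicy", custom_env, SAC, n_sampled_goal=4,
                   goal_selection_strategy="future", verbose=1)
custom_model.learn(10000)
custom_model.save("./her_fetch_task")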

Run instructions

  1. Clone the stable-baselines repository:
git clone https://github.com/hill-a/stable-baselines.git
  2. Create a Python 3.7 Conda environment:
conda create -n her_test python=3.7
  3. Activate the Conda environment:
conda activate her_test
  4. Install the stable-baselines package together with the required dependencies (run from the cloned repository root):
pip install .[mpi]
  5. Run the script and verify that it works.

:heavy_exclamation_mark: NOTE: The [mpi] tag is essential to get it to work; otherwise, you will receive the error explained in this issue when you try to run the script.
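
If you are unsure whether the MPI dependency was installed, a quick check (sketch; the [mpi] extra is expected to pull in mpi4py):

# Quick sanity check: stable-baselines' [mpi] extra should install mpi4py,
# which the DDPG/HER code path needs.
try:
    import mpi4py
    print("mpi4py found:", mpi4py.__version__)
except ImportError:
    print("mpi4py missing - reinstall with: pip install .[mpi]")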

Final remarks

I did not check the aforementioned steps with other Python versions, so problems might still occur there. If anybody runs into problems while using this guide, feel free to contact me.