Issues with gym video logging #1141

Closed vwxyzjn closed 4 years ago

vwxyzjn commented 4 years ago

wandb --version && python --version && uname wandb, version 0.9.1 Python 3.7.4 Linux


There are two issues with the current support for gym logging.

1) Inconsistent slider steps for the video panel: The media slider does not allow the use of a custom x-axis, resulting in drastically different x-axis scales for value-based methods vs. policy gradient methods. Here are some examples:

The video panel of DQN


The video panel of PPO (PPO does fewer gradient updates and therefore the step is lower)


As we can see, the slider step for the video panel is drastically different even though both scripts ran for 10M timesteps and there are roughly 17 videos for both scripts.

2) Bugs of video logging with SubprocVecEnv

SubprocVecEnv is a common module from openai/baselines (https://github.com/openai/baselines/blob/master/baselines/common/vec_env/subproc_vec_env.py), which allows the use of subprocesses to speed up sample collection from parallelized gym environments. However, this causes issues for wandb video logging.

Not using SubprocVecEnv



Using SubprocVecEnv

https://app.wandb.ai/cleanrl/cleanRL/runs/rmqlq8m4/overview?workspace=user-costa-huang When using SubprocVecEnv, not only do the steps of the slider become incorrect; more importantly, when you have more videos, it does not even allow you to slide over all of them.


What I Did

# subprocessenv.py
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions.categorical import Categorical
from torch.utils.tensorboard import SummaryWriter

import argparse
from distutils.util import strtobool
import numpy as np
import gym
from gym.wrappers import TimeLimit, Monitor
import pybullet_envs
from gym.spaces import Discrete, Box, MultiBinary, MultiDiscrete, Space
import time
import random
import os
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv, VecEnvWrapper

import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from PIL import Image

if __name__ == "__main__":
    # Command-line configuration for the reproduction script.
    parser = argparse.ArgumentParser(description='PPO agent')

    def _str2bool(x):
        """Convert a truthy/falsy string accepted by `strtobool` into a bool."""
        return bool(strtobool(x))

    # Default experiment name is the script's file name without its ".py"
    # suffix. `str.rstrip(".py")` is not used because it strips any trailing
    # '.', 'p' or 'y' characters (e.g. "happy.py" -> "ha"), not the suffix.
    _script_name = os.path.basename(__file__)
    if _script_name.endswith(".py"):
        _script_name = _script_name[:-len(".py")]

    parser.add_argument('--use-subprocess', type=_str2bool, default=False, nargs='?', const=True,
                        help='weather to use `SubprocVecEnv`')

    # Common arguments
    parser.add_argument('--exp-name', type=str, default=_script_name,
                        help='the name of this experiment')
    parser.add_argument('--gym-id', type=str, default="BreakoutNoFrameskip-v4",
                        help='the id of the gym environment')
    parser.add_argument('--learning-rate', type=float, default=2.5e-4,
                        help='the learning rate of the optimizer')
    parser.add_argument('--seed', type=int, default=1,
                        help='seed of the experiment')
    parser.add_argument('--total-timesteps', type=int, default=20000000,
                        help='total timesteps of the experiments')
    parser.add_argument('--torch-deterministic', type=_str2bool, default=True, nargs='?', const=True,
                        help='if toggled, `torch.backends.cudnn.deterministic=False`')
    parser.add_argument('--cuda', type=_str2bool, default=True, nargs='?', const=True,
                        help='if toggled, cuda will not be enabled by default')
    parser.add_argument('--prod-mode', type=_str2bool, default=False, nargs='?', const=True,
                        help='run the script in production mode and use wandb to log outputs')
    parser.add_argument('--capture-video', type=_str2bool, default=False, nargs='?', const=True,
                        help='weather to capture videos of the agent performances (check out `videos` folder)')
    parser.add_argument('--wandb-project-name', type=str, default="cleanRL",
                        help="the wandb's project name")
    parser.add_argument('--wandb-entity', type=str, default=None,
                        help="the entity (team) of wandb's project")

    # Algorithm specific arguments
    parser.add_argument('--n-minibatch', type=int, default=4,
                        help='the number of mini batch')
    parser.add_argument('--num-envs', type=int, default=8,
                        help='the number of parallel game environment')
    parser.add_argument('--num-steps', type=int, default=128,
                        help='the number of steps per game environment')
    args = parser.parse_args()
    # A falsy seed (e.g. `--seed 0`) is replaced by the current Unix time.
    if not args.seed:
        args.seed = int(time.time())

# Derived sizes: one batch holds `num_steps` transitions from each of the
# `num_envs` environments; it is split into `n_minibatch` minibatches.
args.batch_size = int(args.num_envs * args.num_steps)
args.minibatch_size = int(args.batch_size // args.n_minibatch)

class VecPyTorch(VecEnvWrapper):
    """Vectorized-env wrapper that exchanges torch tensors with the caller.

    Observations returned by the wrapped env are converted from numpy arrays
    to float tensors on `device`; actions are converted from tensors to numpy
    arrays before being handed to the wrapped env.
    """

    def __init__(self, venv, device):
        """Wrap `venv` and remember the `device` observations are moved to."""
        super(VecPyTorch, self).__init__(venv)
        self.device = device

    def reset(self):
        """Reset the wrapped env and return the observations as a float tensor on `self.device`."""
        obs = self.venv.reset()
        obs = torch.from_numpy(obs).float().to(self.device)
        return obs

    def step_async(self, actions):
        """Convert tensor `actions` to a numpy array and start the step on the wrapped env."""
        actions = actions.cpu().numpy()
        # Forward the converted actions; without this call the wrapped env
        # never starts a step and the following `step_wait` has nothing to
        # collect.
        self.venv.step_async(actions)

    def step_wait(self):
        """Collect the step result from the wrapped env.

        Returns `(obs, reward, done, info)` where `obs` is a float tensor on
        `self.device`, `reward` is a float tensor with an extra trailing
        dimension (it is not moved to `self.device`), and `done` / `info` are
        passed through unchanged.
        """
        obs, reward, done, info = self.venv.step_wait()
        obs = torch.from_numpy(obs).float().to(self.device)
        reward = torch.from_numpy(reward).unsqueeze(dim=1).float()
        return obs, reward, done, info

# TRY NOT TO MODIFY: setup the environment
# Run identifier built from the env id, experiment name, seed and launch time.
experiment_name = f"{args.gym_id}__{args.exp_name}__{args.seed}__{int(time.time())}"
writer = SummaryWriter(f"runs/{experiment_name}")
# Record every parsed argument as one row of a markdown table.
_hparam_rows = '\n'.join(f"|{key}|{value}|" for key, value in vars(args).items())
writer.add_text('hyperparameters', "|param|value|\n|-|-|\n%s" % _hparam_rows)
if args.prod_mode:
    # wandb is only imported in production mode.
    import wandb
    wandb.init(
        project=args.wandb_project_name,
        entity=args.wandb_entity,
        sync_tensorboard=True,
        config=vars(args),
        name=experiment_name,
        monitor_gym=True,
        save_code=True,
    )
    # In production mode the writer is replaced by one that logs under /tmp.
    writer = SummaryWriter(f"/tmp/{experiment_name}")

# TRY NOT TO MODIFY: seeding
# Use the GPU only when one is available and --cuda is enabled.
if torch.cuda.is_available() and args.cuda:
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
torch.backends.cudnn.deterministic = args.torch_deterministic
def make_env(gym_id, seed, idx):
    """Return a zero-argument factory that builds one gym environment.

    The created env is wrapped with `RecordEpisodeStatistics`. When
    `args.capture_video` is set, the env with `idx == 0` is additionally
    wrapped with `Monitor`, recording every 10th episode under
    `videos/<experiment_name>`. `seed` is accepted but not used here.
    """
    def thunk():
        base_env = gym.make(gym_id)
        wrapped = gym.wrappers.RecordEpisodeStatistics(base_env)
        # Only the first environment records videos.
        if args.capture_video and idx == 0:
            wrapped = Monitor(wrapped, f'videos/{experiment_name}', video_callable=lambda episode_id: episode_id%10==0)
        return wrapped
    return thunk
# Build the vectorized environments: subprocess workers (started with
# "fork") when both --prod-mode and --use-subprocess are set, otherwise
# in-process environments. Both variants are wrapped so that observations
# come back as tensors on `device`.
if args.prod_mode and args.use_subprocess:
    envs = VecPyTorch(
        SubprocVecEnv([make_env(args.gym_id, args.seed+i, i) for i in range(args.num_envs)], "fork"),
        device
    )
else:
    envs = VecPyTorch(DummyVecEnv([make_env(args.gym_id, args.seed+i, i) for i in range(args.num_envs)]), device)
assert isinstance(envs.action_space, Discrete), "only discrete action space is supported"

# ALGO Logic: Storage for epoch data
# Every buffer is indexed by (step, env); observations and actions append the
# shape of the corresponding space.
_rollout_dims = (args.num_steps, args.num_envs)
obs = torch.zeros(_rollout_dims + envs.observation_space.shape).to(device)
actions = torch.zeros(_rollout_dims + envs.action_space.shape).to(device)
logprobs = torch.zeros(_rollout_dims).to(device)
rewards = torch.zeros(_rollout_dims).to(device)
dones = torch.zeros(_rollout_dims).to(device)
values = torch.zeros(_rollout_dims).to(device)

# TRY NOT TO MODIFY: start the game
global_step = 0
# Note how `next_obs` and `next_done` are used; their usage is equivalent to
# https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/84a7582477fb0d5c82ad6d850fe476829dddd2e1/a2c_ppo_acktr/storage.py#L60
next_obs = envs.reset()
next_done = torch.zeros(args.num_envs).to(device)
num_updates = args.total_timesteps // args.batch_size
for update in range(1, num_updates+1):

    # TRY NOT TO MODIFY: prepare the execution of the game.
    for step in range(0, args.num_steps):
        # Each vectorized step advances every environment by one timestep.
        global_step += args.num_envs

        # TRY NOT TO MODIFY: execute the game and log data.
        # Actions are sampled uniformly at random; no policy is involved.
        random_actions = torch.randint(0, envs.action_space.n, (envs.num_envs,))
        next_obs, rs, ds, infos = envs.step(random_actions)
        rewards[step] = rs.view(-1)
        next_done = torch.Tensor(ds).to(device)

        # Only infos carrying an 'episode' entry are reported.
        for info in infos:
            if 'episode' not in info:
                continue
            print(f"global_step={global_step}, episode_reward={info['episode']['r']}")
            writer.add_scalar("charts/episode_reward", info['episode']['r'], global_step)
# Launch one background run per seed with --use-subprocess True.
# All output is discarded; `do`/`done` are required to delimit the loop body.
for seed in {1..1}
do
    (sleep 0.3 && nohup xvfb-run -a python subprocessenv.py \
    --gym-id BreakoutNoFrameskip-v4 \
    --total-timesteps 1000000 \
    --wandb-project-name cleanrl \
    --use-subprocess True \
    --prod-mode \
    --capture-video \
    --seed $seed
    ) >& /dev/null &
done

# Launch one background run per seed with --use-subprocess False.
# All output is discarded; `do`/`done` are required to delimit the loop body.
for seed in {1..1}
do
    (sleep 0.3 && nohup xvfb-run -a python subprocessenv.py \
    --gym-id BreakoutNoFrameskip-v4 \
    --total-timesteps 1000000 \
    --wandb-project-name cleanrl \
    --use-subprocess False \
    --prod-mode \
    --capture-video \
    --seed $seed
    ) >& /dev/null &
done

Proposed solution

It seems to me that the most straightforward approach is to allow the use of a custom x-axis for the steps of the slider, which is related to issue https://github.com/wandb/client/issues/1093

cvphelps commented 4 years ago

Hi Costa, thanks for the awesome detailed bug report, and solid suggestion on the solution here.

vwxyzjn commented 4 years ago

@cvphelps Thanks for the kind words. I am still trying to work on my project on Open RL Benchmark (https://app.wandb.ai/cleanrl/cleanrl.benchmark/reports/benchmark--Vmlldzo0MDcxOA). I hope I will have a major release soon so that I could submit it for potentially being featured in the wandb gallery :)

cvphelps commented 4 years ago

Great, thanks again. Closing this as a duplicate of this ticket: https://github.com/wandb/client/issues/1093